diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/added_tokens.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/added_tokens.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/generation_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00001-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9c5f9d5e6da9c743ddce67f54663f63faa793cbd --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba15411cc33e02beaa15d01b0dc2bad1d4653a12652d32e01a85cbf223851945 +size 4921072616 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00002-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..73d802d6527357d89fd158ecd6262b6d1b7b8020 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4446fe7db6f20ab1976f53490bf25da8689a0eea599144c6e7354f72169a5cc2 +size 4978830984 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00003-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e18931fc3e263f042ada9efe0f6859b341cec8df --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d0a0ee7d2d949f28df3cafd8f3988b330ff53cb47532e16178b6878208c6580 +size 4100977896 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model.safetensors.index.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/norm_stats.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..7a37358d95e92a337ffbc69008e6d3a514583ff2 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -15.553912042236327, + -29.199742523193358, + -19.58108451538086, + -2.290254103851318, + -3.98537020587921, + -3.326780859374999, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 20.256868560791013, + 29.94644501495361, + 21.81786548461914, + 2.931905368041992, + 5.064435471534729, + 3.8213318216323877, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.8829866647720337, + 2.0021812915802, + 0.2094610631465912, + 0.0940750315785408, + 0.0910087525844574, + 0.012966467998921871, + -0.09716881066560745, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.976093769073486, + 10.930583953857422, + 8.330232620239258, + 0.8605863451957703, + 1.5304595232009888, + 1.1747541427612305, + 0.995267927646637, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -14.624815139007566, + -31.510755078125, + -35.281760287475585, + -4.413841687011719, + -8.509904860687255, + -6.548201916885375, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 40.4127169593811, + 31.91034956970215, + 26.84413584289551, + 7.540738459014893, + 10.178268561553956, + 9.913993389892582, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 10.31286334991455, + 3.0421667098999023, + -4.947638511657715, + 0.41632387042045593, + -0.9987452030181885, + -0.18793217837810516, + -0.08814626932144165, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 10.463665962219238, + 14.231209754943848, + 11.03242301940918, + 2.1795010566711426, + 3.3540749549865723, + 2.708117961883545, + 0.9961075186729431, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/pi0.yaml b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/special_tokens_map.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer.model b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/trainer_state.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..24683bc5295e186ddc934a5c640693ffd855cc16 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/trainer_state.json @@ -0,0 +1,8434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3431022158684775, + "eval_steps": 500, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002859185132237312, + "grad_norm": 4.32843542098999, + "learning_rate": 1.8e-07, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0005718370264474624, + "grad_norm": 5.184113502502441, + "learning_rate": 3.8e-07, + "loss": 0.6206, + "step": 20 + }, + { + "epoch": 0.0008577555396711937, + "grad_norm": 4.515527248382568, + "learning_rate": 5.800000000000001e-07, + "loss": 0.582, + "step": 30 + }, + { + "epoch": 0.0011436740528949249, + "grad_norm": 2.8382818698883057, + "learning_rate": 7.8e-07, + "loss": 0.544, + "step": 40 + }, + { + "epoch": 0.0014295925661186562, + "grad_norm": 4.019079208374023, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6381, + "step": 50 + }, + { + "epoch": 0.0017155110793423873, + "grad_norm": 2.9916157722473145, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5463, + "step": 60 + }, + { + "epoch": 0.0020014295925661185, + "grad_norm": 3.3288328647613525, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.446, + "step": 70 + }, + { + "epoch": 0.0022873481057898498, + "grad_norm": 3.181410312652588, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4497, + "step": 80 + }, + { + "epoch": 0.002573266619013581, + "grad_norm": 1.421942949295044, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.349, + "step": 90 + }, + { + "epoch": 0.0028591851322373124, + "grad_norm": 1.908596396446228, + "learning_rate": 1.98e-06, + "loss": 0.3338, + "step": 100 + }, + { + "epoch": 0.0031451036454610438, + "grad_norm": 1.8309729099273682, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.0034310221586847747, + "grad_norm": 3.051408290863037, + "learning_rate": 2.38e-06, + "loss": 0.2418, + "step": 120 + }, + { + "epoch": 0.003716940671908506, + "grad_norm": 2.4083356857299805, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1726, + "step": 130 + }, + { + "epoch": 0.004002859185132237, + "grad_norm": 1.111687421798706, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.2164, + "step": 140 + }, + { + "epoch": 0.004288777698355968, + "grad_norm": 1.3874679803848267, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1312, + "step": 150 + }, + { + "epoch": 0.0045746962115796996, + "grad_norm": 1.2791540622711182, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1198, + "step": 160 + }, + { + "epoch": 0.004860614724803431, + "grad_norm": 1.6237181425094604, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1027, + "step": 170 + }, + { + "epoch": 0.005146533238027162, + "grad_norm": 0.9669432640075684, + "learning_rate": 3.58e-06, + "loss": 0.0968, + "step": 180 + }, + { + "epoch": 0.0054324517512508936, + "grad_norm": 1.4933182001113892, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.1012, + "step": 190 + }, + { + "epoch": 0.005718370264474625, + "grad_norm": 1.8615745306015015, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0901, + "step": 200 + }, + { + "epoch": 0.006004288777698356, + "grad_norm": 1.867163062095642, + "learning_rate": 4.18e-06, + "loss": 0.1067, + "step": 210 + }, + { + "epoch": 0.0062902072909220876, + "grad_norm": 1.199497103691101, + "learning_rate": 4.38e-06, + "loss": 0.0841, + "step": 220 + }, + { + "epoch": 0.006576125804145818, + "grad_norm": 1.1568272113800049, + "learning_rate": 4.58e-06, + "loss": 0.0951, + "step": 230 + }, + { + "epoch": 0.006862044317369549, + "grad_norm": 2.139226198196411, + "learning_rate": 4.78e-06, + "loss": 0.0845, + "step": 240 + }, + { + "epoch": 0.007147962830593281, + "grad_norm": 1.0357667207717896, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0828, + "step": 250 + }, + { + "epoch": 0.007433881343817012, + "grad_norm": 1.0145683288574219, + "learning_rate": 5.18e-06, + "loss": 0.0925, + "step": 260 + }, + { + "epoch": 0.007719799857040743, + "grad_norm": 1.308053731918335, + "learning_rate": 5.380000000000001e-06, + "loss": 0.082, + "step": 270 + }, + { + "epoch": 0.008005718370264474, + "grad_norm": 1.1561739444732666, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0888, + "step": 280 + }, + { + "epoch": 0.008291636883488206, + "grad_norm": 0.8777005672454834, + "learning_rate": 5.78e-06, + "loss": 0.0693, + "step": 290 + }, + { + "epoch": 0.008577555396711936, + "grad_norm": 0.9127368330955505, + "learning_rate": 5.98e-06, + "loss": 0.0823, + "step": 300 + }, + { + "epoch": 0.008863473909935669, + "grad_norm": 0.5608117580413818, + "learning_rate": 6.18e-06, + "loss": 0.0733, + "step": 310 + }, + { + "epoch": 0.009149392423159399, + "grad_norm": 1.9068444967269897, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0772, + "step": 320 + }, + { + "epoch": 0.009435310936383131, + "grad_norm": 0.9090886116027832, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.062, + "step": 330 + }, + { + "epoch": 0.009721229449606862, + "grad_norm": 1.191778540611267, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0718, + "step": 340 + }, + { + "epoch": 0.010007147962830594, + "grad_norm": 1.3743036985397339, + "learning_rate": 6.98e-06, + "loss": 0.0822, + "step": 350 + }, + { + "epoch": 0.010293066476054324, + "grad_norm": 1.4244364500045776, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0793, + "step": 360 + }, + { + "epoch": 0.010578984989278055, + "grad_norm": 1.1766910552978516, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0637, + "step": 370 + }, + { + "epoch": 0.010864903502501787, + "grad_norm": 1.1331329345703125, + "learning_rate": 7.58e-06, + "loss": 0.0705, + "step": 380 + }, + { + "epoch": 0.011150822015725518, + "grad_norm": 0.4898548424243927, + "learning_rate": 7.78e-06, + "loss": 0.0686, + "step": 390 + }, + { + "epoch": 0.01143674052894925, + "grad_norm": 0.7398406267166138, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0719, + "step": 400 + }, + { + "epoch": 0.01172265904217298, + "grad_norm": 1.1516162157058716, + "learning_rate": 8.18e-06, + "loss": 0.0696, + "step": 410 + }, + { + "epoch": 0.012008577555396712, + "grad_norm": 1.6034163236618042, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0698, + "step": 420 + }, + { + "epoch": 0.012294496068620443, + "grad_norm": 1.2195311784744263, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.012580414581844175, + "grad_norm": 1.1106441020965576, + "learning_rate": 8.78e-06, + "loss": 0.0749, + "step": 440 + }, + { + "epoch": 0.012866333095067906, + "grad_norm": 1.1787506341934204, + "learning_rate": 8.98e-06, + "loss": 0.0718, + "step": 450 + }, + { + "epoch": 0.013152251608291636, + "grad_norm": 0.4380492568016052, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0692, + "step": 460 + }, + { + "epoch": 0.013438170121515368, + "grad_norm": 1.0138392448425293, + "learning_rate": 9.38e-06, + "loss": 0.0718, + "step": 470 + }, + { + "epoch": 0.013724088634739099, + "grad_norm": 0.50003582239151, + "learning_rate": 9.58e-06, + "loss": 0.078, + "step": 480 + }, + { + "epoch": 0.014010007147962831, + "grad_norm": 0.6253323554992676, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0762, + "step": 490 + }, + { + "epoch": 0.014295925661186561, + "grad_norm": 0.6725791096687317, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0615, + "step": 500 + }, + { + "epoch": 0.014581844174410294, + "grad_norm": 0.6100206971168518, + "learning_rate": 1.018e-05, + "loss": 0.0576, + "step": 510 + }, + { + "epoch": 0.014867762687634024, + "grad_norm": 1.9225071668624878, + "learning_rate": 1.038e-05, + "loss": 0.0957, + "step": 520 + }, + { + "epoch": 0.015153681200857756, + "grad_norm": 1.304625391960144, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.015439599714081487, + "grad_norm": 0.7657200694084167, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0612, + "step": 540 + }, + { + "epoch": 0.015725518227305217, + "grad_norm": 0.7371220588684082, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0719, + "step": 550 + }, + { + "epoch": 0.016011436740528948, + "grad_norm": 0.7274985313415527, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0594, + "step": 560 + }, + { + "epoch": 0.01629735525375268, + "grad_norm": 1.3222947120666504, + "learning_rate": 1.138e-05, + "loss": 0.0655, + "step": 570 + }, + { + "epoch": 0.016583273766976412, + "grad_norm": 0.965411901473999, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0638, + "step": 580 + }, + { + "epoch": 0.016869192280200142, + "grad_norm": 0.8161532878875732, + "learning_rate": 1.178e-05, + "loss": 0.0532, + "step": 590 + }, + { + "epoch": 0.017155110793423873, + "grad_norm": 0.8228808045387268, + "learning_rate": 1.198e-05, + "loss": 0.051, + "step": 600 + }, + { + "epoch": 0.017441029306647607, + "grad_norm": 0.6932743191719055, + "learning_rate": 1.218e-05, + "loss": 0.0595, + "step": 610 + }, + { + "epoch": 0.017726947819871337, + "grad_norm": 0.6848511099815369, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0589, + "step": 620 + }, + { + "epoch": 0.018012866333095068, + "grad_norm": 1.137454867362976, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0694, + "step": 630 + }, + { + "epoch": 0.018298784846318798, + "grad_norm": 0.8087878227233887, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0738, + "step": 640 + }, + { + "epoch": 0.01858470335954253, + "grad_norm": 0.8093737363815308, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.058, + "step": 650 + }, + { + "epoch": 0.018870621872766263, + "grad_norm": 0.8387401700019836, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0686, + "step": 660 + }, + { + "epoch": 0.019156540385989993, + "grad_norm": 1.1544110774993896, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0592, + "step": 670 + }, + { + "epoch": 0.019442458899213724, + "grad_norm": 0.8208314180374146, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.019728377412437454, + "grad_norm": 0.97088623046875, + "learning_rate": 1.378e-05, + "loss": 0.0675, + "step": 690 + }, + { + "epoch": 0.020014295925661188, + "grad_norm": 1.0991814136505127, + "learning_rate": 1.398e-05, + "loss": 0.0745, + "step": 700 + }, + { + "epoch": 0.02030021443888492, + "grad_norm": 0.9467299580574036, + "learning_rate": 1.418e-05, + "loss": 0.0645, + "step": 710 + }, + { + "epoch": 0.02058613295210865, + "grad_norm": 0.4910801351070404, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0466, + "step": 720 + }, + { + "epoch": 0.02087205146533238, + "grad_norm": 1.0102845430374146, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0735, + "step": 730 + }, + { + "epoch": 0.02115796997855611, + "grad_norm": 0.9033467769622803, + "learning_rate": 1.478e-05, + "loss": 0.0741, + "step": 740 + }, + { + "epoch": 0.021443888491779844, + "grad_norm": 1.6092171669006348, + "learning_rate": 1.498e-05, + "loss": 0.0737, + "step": 750 + }, + { + "epoch": 0.021729807005003574, + "grad_norm": 0.7047333717346191, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0604, + "step": 760 + }, + { + "epoch": 0.022015725518227305, + "grad_norm": 1.2015491724014282, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0646, + "step": 770 + }, + { + "epoch": 0.022301644031451035, + "grad_norm": 1.1669623851776123, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0587, + "step": 780 + }, + { + "epoch": 0.02258756254467477, + "grad_norm": 1.137113094329834, + "learning_rate": 1.578e-05, + "loss": 0.0692, + "step": 790 + }, + { + "epoch": 0.0228734810578985, + "grad_norm": 1.269505262374878, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0711, + "step": 800 + }, + { + "epoch": 0.02315939957112223, + "grad_norm": 0.942534863948822, + "learning_rate": 1.618e-05, + "loss": 0.0782, + "step": 810 + }, + { + "epoch": 0.02344531808434596, + "grad_norm": 0.9548556208610535, + "learning_rate": 1.638e-05, + "loss": 0.0814, + "step": 820 + }, + { + "epoch": 0.02373123659756969, + "grad_norm": 1.0210421085357666, + "learning_rate": 1.658e-05, + "loss": 0.0774, + "step": 830 + }, + { + "epoch": 0.024017155110793425, + "grad_norm": 1.0955135822296143, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0693, + "step": 840 + }, + { + "epoch": 0.024303073624017155, + "grad_norm": 1.2081682682037354, + "learning_rate": 1.698e-05, + "loss": 0.0589, + "step": 850 + }, + { + "epoch": 0.024588992137240886, + "grad_norm": 0.9728164076805115, + "learning_rate": 1.718e-05, + "loss": 0.0585, + "step": 860 + }, + { + "epoch": 0.024874910650464616, + "grad_norm": 1.310244083404541, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.066, + "step": 870 + }, + { + "epoch": 0.02516082916368835, + "grad_norm": 0.8860681653022766, + "learning_rate": 1.758e-05, + "loss": 0.0703, + "step": 880 + }, + { + "epoch": 0.02544674767691208, + "grad_norm": 2.1878466606140137, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0913, + "step": 890 + }, + { + "epoch": 0.02573266619013581, + "grad_norm": 0.6659205555915833, + "learning_rate": 1.798e-05, + "loss": 0.0603, + "step": 900 + }, + { + "epoch": 0.02601858470335954, + "grad_norm": 0.6700656414031982, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.074, + "step": 910 + }, + { + "epoch": 0.026304503216583272, + "grad_norm": 0.8292778134346008, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0529, + "step": 920 + }, + { + "epoch": 0.026590421729807006, + "grad_norm": 0.9637550115585327, + "learning_rate": 1.858e-05, + "loss": 0.0604, + "step": 930 + }, + { + "epoch": 0.026876340243030736, + "grad_norm": 0.4605652689933777, + "learning_rate": 1.878e-05, + "loss": 0.0657, + "step": 940 + }, + { + "epoch": 0.027162258756254467, + "grad_norm": 1.3346972465515137, + "learning_rate": 1.898e-05, + "loss": 0.0576, + "step": 950 + }, + { + "epoch": 0.027448177269478197, + "grad_norm": 0.8369432091712952, + "learning_rate": 1.918e-05, + "loss": 0.0567, + "step": 960 + }, + { + "epoch": 0.02773409578270193, + "grad_norm": 0.613459050655365, + "learning_rate": 1.938e-05, + "loss": 0.0523, + "step": 970 + }, + { + "epoch": 0.028020014295925662, + "grad_norm": 1.402799367904663, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0794, + "step": 980 + }, + { + "epoch": 0.028305932809149392, + "grad_norm": 1.1603201627731323, + "learning_rate": 1.978e-05, + "loss": 0.0583, + "step": 990 + }, + { + "epoch": 0.028591851322373123, + "grad_norm": 0.8101517558097839, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0584, + "step": 1000 + }, + { + "epoch": 0.028877769835596853, + "grad_norm": 1.060592770576477, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.077, + "step": 1010 + }, + { + "epoch": 0.029163688348820587, + "grad_norm": 1.2096195220947266, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.066, + "step": 1020 + }, + { + "epoch": 0.029449606862044318, + "grad_norm": 1.0035862922668457, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0625, + "step": 1030 + }, + { + "epoch": 0.029735525375268048, + "grad_norm": 0.44185084104537964, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.03002144388849178, + "grad_norm": 1.209908127784729, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0693, + "step": 1050 + }, + { + "epoch": 0.030307362401715512, + "grad_norm": 0.9716938138008118, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0706, + "step": 1060 + }, + { + "epoch": 0.030593280914939243, + "grad_norm": 0.8310994505882263, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0622, + "step": 1070 + }, + { + "epoch": 0.030879199428162973, + "grad_norm": 0.8737888932228088, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0564, + "step": 1080 + }, + { + "epoch": 0.031165117941386704, + "grad_norm": 0.7609763145446777, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0552, + "step": 1090 + }, + { + "epoch": 0.031451036454610434, + "grad_norm": 0.6319764256477356, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0593, + "step": 1100 + }, + { + "epoch": 0.031736954967834165, + "grad_norm": 0.5562251806259155, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0553, + "step": 1110 + }, + { + "epoch": 0.032022873481057895, + "grad_norm": 1.3476046323776245, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0805, + "step": 1120 + }, + { + "epoch": 0.03230879199428163, + "grad_norm": 0.5449394583702087, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0666, + "step": 1130 + }, + { + "epoch": 0.03259471050750536, + "grad_norm": 0.8675817251205444, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0703, + "step": 1140 + }, + { + "epoch": 0.032880629020729094, + "grad_norm": 0.8713150024414062, + "learning_rate": 1.999882759038658e-05, + "loss": 0.063, + "step": 1150 + }, + { + "epoch": 0.033166547533952824, + "grad_norm": 0.7205761075019836, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0597, + "step": 1160 + }, + { + "epoch": 0.033452466047176554, + "grad_norm": 0.482741117477417, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0558, + "step": 1170 + }, + { + "epoch": 0.033738384560400285, + "grad_norm": 0.8652167320251465, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0559, + "step": 1180 + }, + { + "epoch": 0.034024303073624015, + "grad_norm": 0.5286755561828613, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0673, + "step": 1190 + }, + { + "epoch": 0.034310221586847746, + "grad_norm": 0.9883217215538025, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0609, + "step": 1200 + }, + { + "epoch": 0.034596140100071476, + "grad_norm": 0.7700253129005432, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0703, + "step": 1210 + }, + { + "epoch": 0.034882058613295214, + "grad_norm": 0.8669867515563965, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0577, + "step": 1220 + }, + { + "epoch": 0.035167977126518944, + "grad_norm": 0.8856104016304016, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0599, + "step": 1230 + }, + { + "epoch": 0.035453895639742675, + "grad_norm": 0.5517004728317261, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0791, + "step": 1240 + }, + { + "epoch": 0.035739814152966405, + "grad_norm": 0.7505853176116943, + "learning_rate": 1.999672592499692e-05, + "loss": 0.086, + "step": 1250 + }, + { + "epoch": 0.036025732666190136, + "grad_norm": 0.7412230968475342, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0534, + "step": 1260 + }, + { + "epoch": 0.036311651179413866, + "grad_norm": 0.6629419922828674, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0607, + "step": 1270 + }, + { + "epoch": 0.036597569692637597, + "grad_norm": 0.7081887125968933, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0513, + "step": 1280 + }, + { + "epoch": 0.03688348820586133, + "grad_norm": 0.8555129766464233, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0574, + "step": 1290 + }, + { + "epoch": 0.03716940671908506, + "grad_norm": 0.5992563366889954, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0529, + "step": 1300 + }, + { + "epoch": 0.037455325232308795, + "grad_norm": 0.8527185320854187, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0588, + "step": 1310 + }, + { + "epoch": 0.037741243745532525, + "grad_norm": 1.078600525856018, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0644, + "step": 1320 + }, + { + "epoch": 0.038027162258756256, + "grad_norm": 0.8158502578735352, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0587, + "step": 1330 + }, + { + "epoch": 0.038313080771979986, + "grad_norm": 1.011278748512268, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0696, + "step": 1340 + }, + { + "epoch": 0.03859899928520372, + "grad_norm": 0.806888222694397, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0588, + "step": 1350 + }, + { + "epoch": 0.03888491779842745, + "grad_norm": 0.7776031494140625, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0461, + "step": 1360 + }, + { + "epoch": 0.03917083631165118, + "grad_norm": 0.6119349598884583, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0566, + "step": 1370 + }, + { + "epoch": 0.03945675482487491, + "grad_norm": 0.6168059706687927, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0636, + "step": 1380 + }, + { + "epoch": 0.03974267333809864, + "grad_norm": 0.8180692195892334, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0596, + "step": 1390 + }, + { + "epoch": 0.040028591851322376, + "grad_norm": 0.6775726079940796, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0586, + "step": 1400 + }, + { + "epoch": 0.040314510364546106, + "grad_norm": 0.7446377873420715, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.057, + "step": 1410 + }, + { + "epoch": 0.04060042887776984, + "grad_norm": 0.9334514737129211, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0551, + "step": 1420 + }, + { + "epoch": 0.04088634739099357, + "grad_norm": 1.481874942779541, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0664, + "step": 1430 + }, + { + "epoch": 0.0411722659042173, + "grad_norm": 0.9553850889205933, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0645, + "step": 1440 + }, + { + "epoch": 0.04145818441744103, + "grad_norm": 0.8824119567871094, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0694, + "step": 1450 + }, + { + "epoch": 0.04174410293066476, + "grad_norm": 1.0382661819458008, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0683, + "step": 1460 + }, + { + "epoch": 0.04203002144388849, + "grad_norm": 0.5914127826690674, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0652, + "step": 1470 + }, + { + "epoch": 0.04231593995711222, + "grad_norm": 0.8497964143753052, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0706, + "step": 1480 + }, + { + "epoch": 0.04260185847033596, + "grad_norm": 0.897759199142456, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0647, + "step": 1490 + }, + { + "epoch": 0.04288777698355969, + "grad_norm": 1.1102443933486938, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0579, + "step": 1500 + }, + { + "epoch": 0.04317369549678342, + "grad_norm": 0.7638678550720215, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0606, + "step": 1510 + }, + { + "epoch": 0.04345961401000715, + "grad_norm": 0.6662708520889282, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.067, + "step": 1520 + }, + { + "epoch": 0.04374553252323088, + "grad_norm": 0.4957924485206604, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0685, + "step": 1530 + }, + { + "epoch": 0.04403145103645461, + "grad_norm": 0.6456794738769531, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0665, + "step": 1540 + }, + { + "epoch": 0.04431736954967834, + "grad_norm": 1.1598498821258545, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0527, + "step": 1550 + }, + { + "epoch": 0.04460328806290207, + "grad_norm": 0.931520938873291, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0682, + "step": 1560 + }, + { + "epoch": 0.0448892065761258, + "grad_norm": 0.7289925813674927, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0726, + "step": 1570 + }, + { + "epoch": 0.04517512508934954, + "grad_norm": 0.5471235513687134, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0561, + "step": 1580 + }, + { + "epoch": 0.04546104360257327, + "grad_norm": 0.8686550259590149, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0552, + "step": 1590 + }, + { + "epoch": 0.045746962115797, + "grad_norm": 1.1767120361328125, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0544, + "step": 1600 + }, + { + "epoch": 0.04603288062902073, + "grad_norm": 0.8729729056358337, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0625, + "step": 1610 + }, + { + "epoch": 0.04631879914224446, + "grad_norm": 1.3734601736068726, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0667, + "step": 1620 + }, + { + "epoch": 0.04660471765546819, + "grad_norm": 0.6810682415962219, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0522, + "step": 1630 + }, + { + "epoch": 0.04689063616869192, + "grad_norm": 0.7744873762130737, + "learning_rate": 1.997844517262844e-05, + "loss": 0.06, + "step": 1640 + }, + { + "epoch": 0.04717655468191565, + "grad_norm": 1.000954270362854, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0606, + "step": 1650 + }, + { + "epoch": 0.04746247319513938, + "grad_norm": 0.8105701208114624, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.04774839170836312, + "grad_norm": 0.9504240155220032, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0833, + "step": 1670 + }, + { + "epoch": 0.04803431022158685, + "grad_norm": 0.910836935043335, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0582, + "step": 1680 + }, + { + "epoch": 0.04832022873481058, + "grad_norm": 0.5865645408630371, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0657, + "step": 1690 + }, + { + "epoch": 0.04860614724803431, + "grad_norm": 1.0098698139190674, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 0.04889206576125804, + "grad_norm": 0.8097764253616333, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0563, + "step": 1710 + }, + { + "epoch": 0.04917798427448177, + "grad_norm": 0.9958128333091736, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0597, + "step": 1720 + }, + { + "epoch": 0.0494639027877055, + "grad_norm": 0.8471905589103699, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.04974982130092923, + "grad_norm": 0.647058367729187, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 0.05003573981415296, + "grad_norm": 1.0832161903381348, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.052, + "step": 1750 + }, + { + "epoch": 0.0503216583273767, + "grad_norm": 0.8469381332397461, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0615, + "step": 1760 + }, + { + "epoch": 0.05060757684060043, + "grad_norm": 0.5371052622795105, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0556, + "step": 1770 + }, + { + "epoch": 0.05089349535382416, + "grad_norm": 0.9016183614730835, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0561, + "step": 1780 + }, + { + "epoch": 0.05117941386704789, + "grad_norm": 0.8829526305198669, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0648, + "step": 1790 + }, + { + "epoch": 0.05146533238027162, + "grad_norm": 1.079738974571228, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0577, + "step": 1800 + }, + { + "epoch": 0.05175125089349535, + "grad_norm": 0.7496556639671326, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.052, + "step": 1810 + }, + { + "epoch": 0.05203716940671908, + "grad_norm": 0.7587016820907593, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0634, + "step": 1820 + }, + { + "epoch": 0.052323087919942814, + "grad_norm": 0.9622246623039246, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0543, + "step": 1830 + }, + { + "epoch": 0.052609006433166544, + "grad_norm": 0.6643623113632202, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0606, + "step": 1840 + }, + { + "epoch": 0.05289492494639028, + "grad_norm": 0.8060843348503113, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0562, + "step": 1850 + }, + { + "epoch": 0.05318084345961401, + "grad_norm": 0.7353034019470215, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0679, + "step": 1860 + }, + { + "epoch": 0.05346676197283774, + "grad_norm": 0.6636782288551331, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0561, + "step": 1870 + }, + { + "epoch": 0.05375268048606147, + "grad_norm": 0.6760010719299316, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0674, + "step": 1880 + }, + { + "epoch": 0.0540385989992852, + "grad_norm": 0.7144591808319092, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0551, + "step": 1890 + }, + { + "epoch": 0.054324517512508934, + "grad_norm": 0.8346575498580933, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.049, + "step": 1900 + }, + { + "epoch": 0.054610436025732664, + "grad_norm": 1.1682871580123901, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0554, + "step": 1910 + }, + { + "epoch": 0.054896354538956395, + "grad_norm": 0.9150840640068054, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0549, + "step": 1920 + }, + { + "epoch": 0.055182273052180125, + "grad_norm": 0.37064746022224426, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0547, + "step": 1930 + }, + { + "epoch": 0.05546819156540386, + "grad_norm": 1.1214783191680908, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0697, + "step": 1940 + }, + { + "epoch": 0.05575411007862759, + "grad_norm": 0.8259853720664978, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0614, + "step": 1950 + }, + { + "epoch": 0.056040028591851324, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0811, + "step": 1960 + }, + { + "epoch": 0.056325947105075054, + "grad_norm": 0.8764797449111938, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0605, + "step": 1970 + }, + { + "epoch": 0.056611865618298784, + "grad_norm": 0.770044207572937, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0481, + "step": 1980 + }, + { + "epoch": 0.056897784131522515, + "grad_norm": 1.333876132965088, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0729, + "step": 1990 + }, + { + "epoch": 0.057183702644746245, + "grad_norm": 0.5231258273124695, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.051, + "step": 2000 + }, + { + "epoch": 0.057469621157969976, + "grad_norm": 1.1937541961669922, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.061, + "step": 2010 + }, + { + "epoch": 0.057755539671193706, + "grad_norm": 0.7843487858772278, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0688, + "step": 2020 + }, + { + "epoch": 0.058041458184417444, + "grad_norm": 0.7956593632698059, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0878, + "step": 2030 + }, + { + "epoch": 0.058327376697641174, + "grad_norm": 0.5006444454193115, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0588, + "step": 2040 + }, + { + "epoch": 0.058613295210864905, + "grad_norm": 1.162245750427246, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0619, + "step": 2050 + }, + { + "epoch": 0.058899213724088635, + "grad_norm": 0.46943384408950806, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0584, + "step": 2060 + }, + { + "epoch": 0.059185132237312366, + "grad_norm": 0.3780323266983032, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0462, + "step": 2070 + }, + { + "epoch": 0.059471050750536096, + "grad_norm": 0.7066171765327454, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0652, + "step": 2080 + }, + { + "epoch": 0.05975696926375983, + "grad_norm": 0.8464685082435608, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0546, + "step": 2090 + }, + { + "epoch": 0.06004288777698356, + "grad_norm": 0.7198944687843323, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0534, + "step": 2100 + }, + { + "epoch": 0.06032880629020729, + "grad_norm": 0.7136557698249817, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0665, + "step": 2110 + }, + { + "epoch": 0.060614724803431025, + "grad_norm": 0.8739225268363953, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0542, + "step": 2120 + }, + { + "epoch": 0.060900643316654755, + "grad_norm": 0.6694063544273376, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0575, + "step": 2130 + }, + { + "epoch": 0.061186561829878486, + "grad_norm": 0.4805296063423157, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0578, + "step": 2140 + }, + { + "epoch": 0.061472480343102216, + "grad_norm": 0.758660078048706, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0455, + "step": 2150 + }, + { + "epoch": 0.06175839885632595, + "grad_norm": 0.8114968538284302, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0801, + "step": 2160 + }, + { + "epoch": 0.06204431736954968, + "grad_norm": 0.6585670113563538, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0564, + "step": 2170 + }, + { + "epoch": 0.06233023588277341, + "grad_norm": 1.2986794710159302, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0595, + "step": 2180 + }, + { + "epoch": 0.06261615439599715, + "grad_norm": 0.9822471141815186, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0515, + "step": 2190 + }, + { + "epoch": 0.06290207290922087, + "grad_norm": 0.8112025260925293, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0585, + "step": 2200 + }, + { + "epoch": 0.0631879914224446, + "grad_norm": 0.6239551305770874, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0641, + "step": 2210 + }, + { + "epoch": 0.06347390993566833, + "grad_norm": 0.8405657410621643, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.057, + "step": 2220 + }, + { + "epoch": 0.06375982844889207, + "grad_norm": 0.4925670623779297, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0568, + "step": 2230 + }, + { + "epoch": 0.06404574696211579, + "grad_norm": 0.8599978089332581, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0587, + "step": 2240 + }, + { + "epoch": 0.06433166547533953, + "grad_norm": 0.8657258749008179, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.06461758398856327, + "grad_norm": 0.5826218128204346, + "learning_rate": 1.991642153373178e-05, + "loss": 0.055, + "step": 2260 + }, + { + "epoch": 0.06490350250178699, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0533, + "step": 2270 + }, + { + "epoch": 0.06518942101501073, + "grad_norm": 0.8345134258270264, + "learning_rate": 1.991374933341515e-05, + "loss": 0.064, + "step": 2280 + }, + { + "epoch": 0.06547533952823445, + "grad_norm": 0.6610177755355835, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0553, + "step": 2290 + }, + { + "epoch": 0.06576125804145819, + "grad_norm": 0.8541404604911804, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0483, + "step": 2300 + }, + { + "epoch": 0.06604717655468191, + "grad_norm": 0.9029123187065125, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0517, + "step": 2310 + }, + { + "epoch": 0.06633309506790565, + "grad_norm": 0.614111602306366, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.06661901358112937, + "grad_norm": 0.8723806142807007, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0511, + "step": 2330 + }, + { + "epoch": 0.06690493209435311, + "grad_norm": 0.5288586020469666, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0474, + "step": 2340 + }, + { + "epoch": 0.06719085060757685, + "grad_norm": 0.6346511840820312, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0532, + "step": 2350 + }, + { + "epoch": 0.06747676912080057, + "grad_norm": 0.9112687706947327, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0583, + "step": 2360 + }, + { + "epoch": 0.06776268763402431, + "grad_norm": 0.6879385113716125, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0551, + "step": 2370 + }, + { + "epoch": 0.06804860614724803, + "grad_norm": 0.6945562958717346, + "learning_rate": 1.989976094288735e-05, + "loss": 0.053, + "step": 2380 + }, + { + "epoch": 0.06833452466047177, + "grad_norm": 0.6774301528930664, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0596, + "step": 2390 + }, + { + "epoch": 0.06862044317369549, + "grad_norm": 0.7311446070671082, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0576, + "step": 2400 + }, + { + "epoch": 0.06890636168691923, + "grad_norm": 0.9301936030387878, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0608, + "step": 2410 + }, + { + "epoch": 0.06919228020014295, + "grad_norm": 1.1750341653823853, + "learning_rate": 1.989387305123247e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.06947819871336669, + "grad_norm": 0.716266930103302, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.053, + "step": 2430 + }, + { + "epoch": 0.06976411722659043, + "grad_norm": 0.8549973964691162, + "learning_rate": 1.989086647373215e-05, + "loss": 0.061, + "step": 2440 + }, + { + "epoch": 0.07005003573981415, + "grad_norm": 0.7306638360023499, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "epoch": 0.07033595425303789, + "grad_norm": 1.2529624700546265, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0597, + "step": 2460 + }, + { + "epoch": 0.07062187276626161, + "grad_norm": 0.7199717164039612, + "learning_rate": 1.988627835751598e-05, + "loss": 0.047, + "step": 2470 + }, + { + "epoch": 0.07090779127948535, + "grad_norm": 0.8007253408432007, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0544, + "step": 2480 + }, + { + "epoch": 0.07119370979270907, + "grad_norm": 0.7852535843849182, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0507, + "step": 2490 + }, + { + "epoch": 0.07147962830593281, + "grad_norm": 1.0649739503860474, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.058, + "step": 2500 + }, + { + "epoch": 0.07176554681915653, + "grad_norm": 0.8080071806907654, + "learning_rate": 1.988001487826387e-05, + "loss": 0.059, + "step": 2510 + }, + { + "epoch": 0.07205146533238027, + "grad_norm": 0.49453601241111755, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0522, + "step": 2520 + }, + { + "epoch": 0.07233738384560401, + "grad_norm": 0.7618975639343262, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.07262330235882773, + "grad_norm": 0.6284596920013428, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.058, + "step": 2540 + }, + { + "epoch": 0.07290922087205147, + "grad_norm": 1.6536812782287598, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0535, + "step": 2550 + }, + { + "epoch": 0.07319513938527519, + "grad_norm": 0.6516987681388855, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.061, + "step": 2560 + }, + { + "epoch": 0.07348105789849893, + "grad_norm": 0.7660441398620605, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0603, + "step": 2570 + }, + { + "epoch": 0.07376697641172265, + "grad_norm": 0.7900884747505188, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0494, + "step": 2580 + }, + { + "epoch": 0.07405289492494639, + "grad_norm": 0.9578459858894348, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0492, + "step": 2590 + }, + { + "epoch": 0.07433881343817011, + "grad_norm": 0.5268751978874207, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0524, + "step": 2600 + }, + { + "epoch": 0.07462473195139385, + "grad_norm": 0.8935990929603577, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0537, + "step": 2610 + }, + { + "epoch": 0.07491065046461759, + "grad_norm": 0.940441370010376, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0545, + "step": 2620 + }, + { + "epoch": 0.07519656897784131, + "grad_norm": 0.42767468094825745, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0508, + "step": 2630 + }, + { + "epoch": 0.07548248749106505, + "grad_norm": 0.6892207860946655, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0417, + "step": 2640 + }, + { + "epoch": 0.07576840600428877, + "grad_norm": 1.2622859477996826, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0665, + "step": 2650 + }, + { + "epoch": 0.07605432451751251, + "grad_norm": 0.8809115290641785, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0617, + "step": 2660 + }, + { + "epoch": 0.07634024303073624, + "grad_norm": 0.604371190071106, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0577, + "step": 2670 + }, + { + "epoch": 0.07662616154395997, + "grad_norm": 0.7091525793075562, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0552, + "step": 2680 + }, + { + "epoch": 0.0769120800571837, + "grad_norm": 0.7841326594352722, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0579, + "step": 2690 + }, + { + "epoch": 0.07719799857040743, + "grad_norm": 0.7789046764373779, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0511, + "step": 2700 + }, + { + "epoch": 0.07748391708363117, + "grad_norm": 0.6497660875320435, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0532, + "step": 2710 + }, + { + "epoch": 0.0777698355968549, + "grad_norm": 0.6902356147766113, + "learning_rate": 1.984439891859038e-05, + "loss": 0.06, + "step": 2720 + }, + { + "epoch": 0.07805575411007863, + "grad_norm": 0.5721703767776489, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0442, + "step": 2730 + }, + { + "epoch": 0.07834167262330236, + "grad_norm": 0.5205336809158325, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0551, + "step": 2740 + }, + { + "epoch": 0.07862759113652609, + "grad_norm": 1.0646073818206787, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0546, + "step": 2750 + }, + { + "epoch": 0.07891350964974982, + "grad_norm": 0.6809906363487244, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0603, + "step": 2760 + }, + { + "epoch": 0.07919942816297355, + "grad_norm": 0.7592756152153015, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0611, + "step": 2770 + }, + { + "epoch": 0.07948534667619728, + "grad_norm": 0.970733106136322, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.066, + "step": 2780 + }, + { + "epoch": 0.07977126518942101, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.983150881656814e-05, + "loss": 0.049, + "step": 2790 + }, + { + "epoch": 0.08005718370264475, + "grad_norm": 0.6761397123336792, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.048, + "step": 2800 + }, + { + "epoch": 0.08034310221586848, + "grad_norm": 0.9752228856086731, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.08062902072909221, + "grad_norm": 0.8727124929428101, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0629, + "step": 2820 + }, + { + "epoch": 0.08091493924231594, + "grad_norm": 0.8425240516662598, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0517, + "step": 2830 + }, + { + "epoch": 0.08120085775553967, + "grad_norm": 0.7011470198631287, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0484, + "step": 2840 + }, + { + "epoch": 0.0814867762687634, + "grad_norm": 0.836200475692749, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0504, + "step": 2850 + }, + { + "epoch": 0.08177269478198713, + "grad_norm": 0.4431964159011841, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0527, + "step": 2860 + }, + { + "epoch": 0.08205861329521086, + "grad_norm": 0.4666791260242462, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0556, + "step": 2870 + }, + { + "epoch": 0.0823445318084346, + "grad_norm": 0.5705346465110779, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0544, + "step": 2880 + }, + { + "epoch": 0.08263045032165833, + "grad_norm": 1.7237486839294434, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0555, + "step": 2890 + }, + { + "epoch": 0.08291636883488206, + "grad_norm": 0.9305147528648376, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.055, + "step": 2900 + }, + { + "epoch": 0.0832022873481058, + "grad_norm": 1.3475992679595947, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0723, + "step": 2910 + }, + { + "epoch": 0.08348820586132952, + "grad_norm": 0.7196787595748901, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0581, + "step": 2920 + }, + { + "epoch": 0.08377412437455325, + "grad_norm": 0.4567016363143921, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.08406004288777698, + "grad_norm": 0.8537796139717102, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0589, + "step": 2940 + }, + { + "epoch": 0.08434596140100072, + "grad_norm": 0.9526864886283875, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0581, + "step": 2950 + }, + { + "epoch": 0.08463187991422444, + "grad_norm": 0.8753517866134644, + "learning_rate": 1.979809151602651e-05, + "loss": 0.066, + "step": 2960 + }, + { + "epoch": 0.08491779842744818, + "grad_norm": 0.9062561988830566, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0472, + "step": 2970 + }, + { + "epoch": 0.08520371694067191, + "grad_norm": 1.0018329620361328, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0584, + "step": 2980 + }, + { + "epoch": 0.08548963545389564, + "grad_norm": 1.0577157735824585, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.05, + "step": 2990 + }, + { + "epoch": 0.08577555396711938, + "grad_norm": 1.0216799974441528, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0703, + "step": 3000 + }, + { + "epoch": 0.0860614724803431, + "grad_norm": 0.5581191778182983, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0682, + "step": 3010 + }, + { + "epoch": 0.08634739099356684, + "grad_norm": 0.6187682151794434, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 0.08663330950679056, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0565, + "step": 3030 + }, + { + "epoch": 0.0869192280200143, + "grad_norm": 0.8952509760856628, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0615, + "step": 3040 + }, + { + "epoch": 0.08720514653323802, + "grad_norm": 0.7387855648994446, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0434, + "step": 3050 + }, + { + "epoch": 0.08749106504646176, + "grad_norm": 0.8661363124847412, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0453, + "step": 3060 + }, + { + "epoch": 0.0877769835596855, + "grad_norm": 1.552089810371399, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0577, + "step": 3070 + }, + { + "epoch": 0.08806290207290922, + "grad_norm": 0.7555598616600037, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.058, + "step": 3080 + }, + { + "epoch": 0.08834882058613296, + "grad_norm": 0.7763100266456604, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.074, + "step": 3090 + }, + { + "epoch": 0.08863473909935668, + "grad_norm": 0.5088932514190674, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.07, + "step": 3100 + }, + { + "epoch": 0.08892065761258042, + "grad_norm": 0.517383873462677, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.08920657612580414, + "grad_norm": 0.9673930406570435, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.061, + "step": 3120 + }, + { + "epoch": 0.08949249463902788, + "grad_norm": 1.1182832717895508, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0503, + "step": 3130 + }, + { + "epoch": 0.0897784131522516, + "grad_norm": 0.8064592480659485, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0527, + "step": 3140 + }, + { + "epoch": 0.09006433166547534, + "grad_norm": 1.3616310358047485, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0491, + "step": 3150 + }, + { + "epoch": 0.09035025017869908, + "grad_norm": 0.6205968856811523, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0492, + "step": 3160 + }, + { + "epoch": 0.0906361686919228, + "grad_norm": 0.9427729249000549, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.054, + "step": 3170 + }, + { + "epoch": 0.09092208720514654, + "grad_norm": 0.6940050721168518, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0622, + "step": 3180 + }, + { + "epoch": 0.09120800571837026, + "grad_norm": 0.7082361578941345, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0474, + "step": 3190 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.4606474041938782, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 0.09177984274481772, + "grad_norm": 0.46445760130882263, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0483, + "step": 3210 + }, + { + "epoch": 0.09206576125804146, + "grad_norm": 0.7431371212005615, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.046, + "step": 3220 + }, + { + "epoch": 0.09235167977126518, + "grad_norm": 0.8430010676383972, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0665, + "step": 3230 + }, + { + "epoch": 0.09263759828448892, + "grad_norm": 0.9888875484466553, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0676, + "step": 3240 + }, + { + "epoch": 0.09292351679771266, + "grad_norm": 0.792150616645813, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0636, + "step": 3250 + }, + { + "epoch": 0.09320943531093638, + "grad_norm": 0.859030544757843, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0634, + "step": 3260 + }, + { + "epoch": 0.09349535382416012, + "grad_norm": 0.7612795233726501, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0658, + "step": 3270 + }, + { + "epoch": 0.09378127233738384, + "grad_norm": 0.5470104217529297, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0514, + "step": 3280 + }, + { + "epoch": 0.09406719085060758, + "grad_norm": 0.6354894042015076, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0489, + "step": 3290 + }, + { + "epoch": 0.0943531093638313, + "grad_norm": 1.3852356672286987, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 0.09463902787705504, + "grad_norm": 0.5610274076461792, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.09492494639027876, + "grad_norm": 1.2192410230636597, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0525, + "step": 3320 + }, + { + "epoch": 0.0952108649035025, + "grad_norm": 1.06831955909729, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.051, + "step": 3330 + }, + { + "epoch": 0.09549678341672624, + "grad_norm": 0.32288479804992676, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0503, + "step": 3340 + }, + { + "epoch": 0.09578270192994996, + "grad_norm": 0.5871645212173462, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0564, + "step": 3350 + }, + { + "epoch": 0.0960686204431737, + "grad_norm": 0.6069591045379639, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0495, + "step": 3360 + }, + { + "epoch": 0.09635453895639742, + "grad_norm": 1.0015379190444946, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0624, + "step": 3370 + }, + { + "epoch": 0.09664045746962116, + "grad_norm": 0.7534980773925781, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0618, + "step": 3380 + }, + { + "epoch": 0.09692637598284488, + "grad_norm": 0.45888280868530273, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0445, + "step": 3390 + }, + { + "epoch": 0.09721229449606862, + "grad_norm": 0.7550806403160095, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0461, + "step": 3400 + }, + { + "epoch": 0.09749821300929234, + "grad_norm": 0.4738181531429291, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 0.09778413152251608, + "grad_norm": 0.6711190938949585, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0476, + "step": 3420 + }, + { + "epoch": 0.09807005003573982, + "grad_norm": 0.4751316010951996, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0507, + "step": 3430 + }, + { + "epoch": 0.09835596854896354, + "grad_norm": 0.83565753698349, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "epoch": 0.09864188706218728, + "grad_norm": 0.5360665321350098, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0627, + "step": 3450 + }, + { + "epoch": 0.098927805575411, + "grad_norm": 0.7463604211807251, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0496, + "step": 3460 + }, + { + "epoch": 0.09921372408863474, + "grad_norm": 0.7294344305992126, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0697, + "step": 3470 + }, + { + "epoch": 0.09949964260185847, + "grad_norm": 0.5676283836364746, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0541, + "step": 3480 + }, + { + "epoch": 0.0997855611150822, + "grad_norm": 0.5879732370376587, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 0.10007147962830593, + "grad_norm": 0.832818865776062, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0505, + "step": 3500 + }, + { + "epoch": 0.10035739814152966, + "grad_norm": 0.48553410172462463, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0573, + "step": 3510 + }, + { + "epoch": 0.1006433166547534, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0449, + "step": 3520 + }, + { + "epoch": 0.10092923516797712, + "grad_norm": 0.7497885227203369, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0737, + "step": 3530 + }, + { + "epoch": 0.10121515368120086, + "grad_norm": 0.5581928491592407, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0514, + "step": 3540 + }, + { + "epoch": 0.10150107219442459, + "grad_norm": 1.140236258506775, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0524, + "step": 3550 + }, + { + "epoch": 0.10178699070764832, + "grad_norm": 0.8161870241165161, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0721, + "step": 3560 + }, + { + "epoch": 0.10207290922087205, + "grad_norm": 0.8796533942222595, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0564, + "step": 3570 + }, + { + "epoch": 0.10235882773409578, + "grad_norm": 1.4811128377914429, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.10264474624731951, + "grad_norm": 0.8029062747955322, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0589, + "step": 3590 + }, + { + "epoch": 0.10293066476054324, + "grad_norm": 0.7806634902954102, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0617, + "step": 3600 + }, + { + "epoch": 0.10321658327376698, + "grad_norm": 1.1286838054656982, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0574, + "step": 3610 + }, + { + "epoch": 0.1035025017869907, + "grad_norm": 0.374104768037796, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.047, + "step": 3620 + }, + { + "epoch": 0.10378842030021444, + "grad_norm": 1.1743136644363403, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0676, + "step": 3630 + }, + { + "epoch": 0.10407433881343817, + "grad_norm": 0.7684413194656372, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0663, + "step": 3640 + }, + { + "epoch": 0.1043602573266619, + "grad_norm": 1.0642409324645996, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.051, + "step": 3650 + }, + { + "epoch": 0.10464617583988563, + "grad_norm": 0.7752460837364197, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0543, + "step": 3660 + }, + { + "epoch": 0.10493209435310936, + "grad_norm": 0.9053257703781128, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.051, + "step": 3670 + }, + { + "epoch": 0.10521801286633309, + "grad_norm": 0.7407983541488647, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 0.10550393137955683, + "grad_norm": 1.3622519969940186, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0468, + "step": 3690 + }, + { + "epoch": 0.10578984989278056, + "grad_norm": 1.2751463651657104, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0571, + "step": 3700 + }, + { + "epoch": 0.10607576840600429, + "grad_norm": 0.5535411238670349, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0564, + "step": 3710 + }, + { + "epoch": 0.10636168691922802, + "grad_norm": 0.6728671193122864, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0487, + "step": 3720 + }, + { + "epoch": 0.10664760543245175, + "grad_norm": 0.82345050573349, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0656, + "step": 3730 + }, + { + "epoch": 0.10693352394567548, + "grad_norm": 0.6446594595909119, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0467, + "step": 3740 + }, + { + "epoch": 0.10721944245889921, + "grad_norm": 1.0836280584335327, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0536, + "step": 3750 + }, + { + "epoch": 0.10750536097212295, + "grad_norm": 0.3758300840854645, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0476, + "step": 3760 + }, + { + "epoch": 0.10779127948534667, + "grad_norm": 0.682266116142273, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0519, + "step": 3770 + }, + { + "epoch": 0.1080771979985704, + "grad_norm": 0.5025804042816162, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0474, + "step": 3780 + }, + { + "epoch": 0.10836311651179414, + "grad_norm": 1.019890308380127, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0492, + "step": 3790 + }, + { + "epoch": 0.10864903502501787, + "grad_norm": 0.7843710780143738, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0561, + "step": 3800 + }, + { + "epoch": 0.1089349535382416, + "grad_norm": 0.5028522610664368, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0582, + "step": 3810 + }, + { + "epoch": 0.10922087205146533, + "grad_norm": 0.6400144696235657, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0502, + "step": 3820 + }, + { + "epoch": 0.10950679056468907, + "grad_norm": 0.9432899355888367, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0471, + "step": 3830 + }, + { + "epoch": 0.10979270907791279, + "grad_norm": 0.7582482695579529, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.052, + "step": 3840 + }, + { + "epoch": 0.11007862759113653, + "grad_norm": 0.34035608172416687, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0464, + "step": 3850 + }, + { + "epoch": 0.11036454610436025, + "grad_norm": 1.3330878019332886, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.11065046461758399, + "grad_norm": 0.7309219837188721, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.052, + "step": 3870 + }, + { + "epoch": 0.11093638313080773, + "grad_norm": 0.6248922944068909, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0815, + "step": 3880 + }, + { + "epoch": 0.11122230164403145, + "grad_norm": 0.8298835158348083, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0491, + "step": 3890 + }, + { + "epoch": 0.11150822015725519, + "grad_norm": 0.6728928685188293, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0506, + "step": 3900 + }, + { + "epoch": 0.11179413867047891, + "grad_norm": 0.8456764817237854, + "learning_rate": 1.95567930185928e-05, + "loss": 0.051, + "step": 3910 + }, + { + "epoch": 0.11208005718370265, + "grad_norm": 0.9024212956428528, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0514, + "step": 3920 + }, + { + "epoch": 0.11236597569692637, + "grad_norm": 0.4843275845050812, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.056, + "step": 3930 + }, + { + "epoch": 0.11265189421015011, + "grad_norm": 0.5677530765533447, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0548, + "step": 3940 + }, + { + "epoch": 0.11293781272337383, + "grad_norm": 1.0913296937942505, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0697, + "step": 3950 + }, + { + "epoch": 0.11322373123659757, + "grad_norm": 0.6271129250526428, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0655, + "step": 3960 + }, + { + "epoch": 0.1135096497498213, + "grad_norm": 0.9063813090324402, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0469, + "step": 3970 + }, + { + "epoch": 0.11379556826304503, + "grad_norm": 0.7493836283683777, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0482, + "step": 3980 + }, + { + "epoch": 0.11408148677626877, + "grad_norm": 0.8022870421409607, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0655, + "step": 3990 + }, + { + "epoch": 0.11436740528949249, + "grad_norm": 0.6266750693321228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0542, + "step": 4000 + }, + { + "epoch": 0.11465332380271623, + "grad_norm": 0.45027732849121094, + "learning_rate": 1.95260726824789e-05, + "loss": 0.058, + "step": 4010 + }, + { + "epoch": 0.11493924231593995, + "grad_norm": 0.950760543346405, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0552, + "step": 4020 + }, + { + "epoch": 0.11522516082916369, + "grad_norm": 0.6397078037261963, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0544, + "step": 4030 + }, + { + "epoch": 0.11551107934238741, + "grad_norm": 0.7060579657554626, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0565, + "step": 4040 + }, + { + "epoch": 0.11579699785561115, + "grad_norm": 0.7861781716346741, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0511, + "step": 4050 + }, + { + "epoch": 0.11608291636883489, + "grad_norm": 0.5479229688644409, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0509, + "step": 4060 + }, + { + "epoch": 0.11636883488205861, + "grad_norm": 0.3854960501194, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0623, + "step": 4070 + }, + { + "epoch": 0.11665475339528235, + "grad_norm": 1.9533435106277466, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0643, + "step": 4080 + }, + { + "epoch": 0.11694067190850607, + "grad_norm": 0.5853668451309204, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0453, + "step": 4090 + }, + { + "epoch": 0.11722659042172981, + "grad_norm": 0.6850668787956238, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0555, + "step": 4100 + }, + { + "epoch": 0.11751250893495353, + "grad_norm": 1.1605839729309082, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0586, + "step": 4110 + }, + { + "epoch": 0.11779842744817727, + "grad_norm": 0.7753151059150696, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0623, + "step": 4120 + }, + { + "epoch": 0.118084345961401, + "grad_norm": 0.7955726385116577, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0467, + "step": 4130 + }, + { + "epoch": 0.11837026447462473, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0523, + "step": 4140 + }, + { + "epoch": 0.11865618298784847, + "grad_norm": 0.5821241140365601, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 0.11894210150107219, + "grad_norm": 0.4795539379119873, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0559, + "step": 4160 + }, + { + "epoch": 0.11922802001429593, + "grad_norm": 0.6324377655982971, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0473, + "step": 4170 + }, + { + "epoch": 0.11951393852751965, + "grad_norm": 0.8578745722770691, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0474, + "step": 4180 + }, + { + "epoch": 0.11979985704074339, + "grad_norm": 0.5988736748695374, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 0.12008577555396711, + "grad_norm": 0.8098701238632202, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0511, + "step": 4200 + }, + { + "epoch": 0.12037169406719085, + "grad_norm": 1.2059956789016724, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0501, + "step": 4210 + }, + { + "epoch": 0.12065761258041457, + "grad_norm": 0.7477571368217468, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0565, + "step": 4220 + }, + { + "epoch": 0.12094353109363831, + "grad_norm": 0.467942476272583, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0601, + "step": 4230 + }, + { + "epoch": 0.12122944960686205, + "grad_norm": 0.5761682391166687, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.052, + "step": 4240 + }, + { + "epoch": 0.12151536812008577, + "grad_norm": 0.8247032761573792, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0503, + "step": 4250 + }, + { + "epoch": 0.12180128663330951, + "grad_norm": 0.5218040347099304, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0434, + "step": 4260 + }, + { + "epoch": 0.12208720514653323, + "grad_norm": 0.5024936199188232, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 0.12237312365975697, + "grad_norm": 0.5558021664619446, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0493, + "step": 4280 + }, + { + "epoch": 0.1226590421729807, + "grad_norm": 0.6252139210700989, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0467, + "step": 4290 + }, + { + "epoch": 0.12294496068620443, + "grad_norm": 0.6613588929176331, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0572, + "step": 4300 + }, + { + "epoch": 0.12323087919942816, + "grad_norm": 0.8098927736282349, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0551, + "step": 4310 + }, + { + "epoch": 0.1235167977126519, + "grad_norm": 0.8598331809043884, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0517, + "step": 4320 + }, + { + "epoch": 0.12380271622587563, + "grad_norm": 1.2555822134017944, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0499, + "step": 4330 + }, + { + "epoch": 0.12408863473909935, + "grad_norm": 0.5311633348464966, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0467, + "step": 4340 + }, + { + "epoch": 0.12437455325232309, + "grad_norm": 0.5674521327018738, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0564, + "step": 4350 + }, + { + "epoch": 0.12466047176554682, + "grad_norm": 0.5226582884788513, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0503, + "step": 4360 + }, + { + "epoch": 0.12494639027877055, + "grad_norm": 0.8510275483131409, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0626, + "step": 4370 + }, + { + "epoch": 0.1252323087919943, + "grad_norm": 1.6184005737304688, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0565, + "step": 4380 + }, + { + "epoch": 0.125518227305218, + "grad_norm": 0.7836401462554932, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0567, + "step": 4390 + }, + { + "epoch": 0.12580414581844174, + "grad_norm": 0.686989963054657, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0727, + "step": 4400 + }, + { + "epoch": 0.12609006433166547, + "grad_norm": 0.6000984907150269, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0459, + "step": 4410 + }, + { + "epoch": 0.1263759828448892, + "grad_norm": 0.8751336932182312, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0509, + "step": 4420 + }, + { + "epoch": 0.12666190135811295, + "grad_norm": 0.9281551837921143, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0536, + "step": 4430 + }, + { + "epoch": 0.12694781987133666, + "grad_norm": 0.5268979668617249, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 0.1272337383845604, + "grad_norm": 0.9246962070465088, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0525, + "step": 4450 + }, + { + "epoch": 0.12751965689778413, + "grad_norm": 1.2159569263458252, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0559, + "step": 4460 + }, + { + "epoch": 0.12780557541100787, + "grad_norm": 1.1705470085144043, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0473, + "step": 4470 + }, + { + "epoch": 0.12809149392423158, + "grad_norm": 0.4624033570289612, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0385, + "step": 4480 + }, + { + "epoch": 0.12837741243745532, + "grad_norm": 0.68497633934021, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.055, + "step": 4490 + }, + { + "epoch": 0.12866333095067906, + "grad_norm": 0.6132450699806213, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0512, + "step": 4500 + }, + { + "epoch": 0.1289492494639028, + "grad_norm": 0.7438398003578186, + "learning_rate": 1.935753861926916e-05, + "loss": 0.057, + "step": 4510 + }, + { + "epoch": 0.12923516797712653, + "grad_norm": 1.01064133644104, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0542, + "step": 4520 + }, + { + "epoch": 0.12952108649035024, + "grad_norm": 0.7620115280151367, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0511, + "step": 4530 + }, + { + "epoch": 0.12980700500357398, + "grad_norm": 0.8325042128562927, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0434, + "step": 4540 + }, + { + "epoch": 0.13009292351679771, + "grad_norm": 1.333525538444519, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0527, + "step": 4550 + }, + { + "epoch": 0.13037884203002145, + "grad_norm": 0.5498093962669373, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0455, + "step": 4560 + }, + { + "epoch": 0.13066476054324516, + "grad_norm": 0.8072503209114075, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0471, + "step": 4570 + }, + { + "epoch": 0.1309506790564689, + "grad_norm": 0.7596970200538635, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0476, + "step": 4580 + }, + { + "epoch": 0.13123659756969264, + "grad_norm": 0.5895066857337952, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.058, + "step": 4590 + }, + { + "epoch": 0.13152251608291637, + "grad_norm": 0.7977209687232971, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0453, + "step": 4600 + }, + { + "epoch": 0.1318084345961401, + "grad_norm": 0.6070771813392639, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0626, + "step": 4610 + }, + { + "epoch": 0.13209435310936382, + "grad_norm": 0.776318371295929, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0502, + "step": 4620 + }, + { + "epoch": 0.13238027162258756, + "grad_norm": 0.7913787961006165, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0495, + "step": 4630 + }, + { + "epoch": 0.1326661901358113, + "grad_norm": 0.7327920794487, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0537, + "step": 4640 + }, + { + "epoch": 0.13295210864903503, + "grad_norm": 1.2004122734069824, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0479, + "step": 4650 + }, + { + "epoch": 0.13323802716225874, + "grad_norm": 0.663301408290863, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0426, + "step": 4660 + }, + { + "epoch": 0.13352394567548248, + "grad_norm": 0.7744486331939697, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0538, + "step": 4670 + }, + { + "epoch": 0.13380986418870622, + "grad_norm": 0.6179795265197754, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.13409578270192996, + "grad_norm": 0.6461634635925293, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0458, + "step": 4690 + }, + { + "epoch": 0.1343817012151537, + "grad_norm": 0.6578474640846252, + "learning_rate": 1.928703895604588e-05, + "loss": 0.064, + "step": 4700 + }, + { + "epoch": 0.1346676197283774, + "grad_norm": 0.8851020336151123, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0632, + "step": 4710 + }, + { + "epoch": 0.13495353824160114, + "grad_norm": 0.4704781472682953, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0431, + "step": 4720 + }, + { + "epoch": 0.13523945675482488, + "grad_norm": 0.9809741377830505, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.059, + "step": 4730 + }, + { + "epoch": 0.13552537526804861, + "grad_norm": 0.9307458400726318, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0528, + "step": 4740 + }, + { + "epoch": 0.13581129378127232, + "grad_norm": 0.8084405660629272, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0603, + "step": 4750 + }, + { + "epoch": 0.13609721229449606, + "grad_norm": 0.6919799447059631, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0589, + "step": 4760 + }, + { + "epoch": 0.1363831308077198, + "grad_norm": 0.8543849587440491, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0508, + "step": 4770 + }, + { + "epoch": 0.13666904932094354, + "grad_norm": 0.6308473348617554, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0485, + "step": 4780 + }, + { + "epoch": 0.13695496783416727, + "grad_norm": 0.739931046962738, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0534, + "step": 4790 + }, + { + "epoch": 0.13724088634739098, + "grad_norm": 0.7895604372024536, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0449, + "step": 4800 + }, + { + "epoch": 0.13752680486061472, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0496, + "step": 4810 + }, + { + "epoch": 0.13781272337383846, + "grad_norm": 0.5999978184700012, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.1380986418870622, + "grad_norm": 0.8037213087081909, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0667, + "step": 4830 + }, + { + "epoch": 0.1383845604002859, + "grad_norm": 0.7414689064025879, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0509, + "step": 4840 + }, + { + "epoch": 0.13867047891350964, + "grad_norm": 0.6627739667892456, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0515, + "step": 4850 + }, + { + "epoch": 0.13895639742673338, + "grad_norm": 0.6969587802886963, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0626, + "step": 4860 + }, + { + "epoch": 0.13924231593995712, + "grad_norm": 0.7554855942726135, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0478, + "step": 4870 + }, + { + "epoch": 0.13952823445318085, + "grad_norm": 0.5623564124107361, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.044, + "step": 4880 + }, + { + "epoch": 0.13981415296640456, + "grad_norm": 0.6897832751274109, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0466, + "step": 4890 + }, + { + "epoch": 0.1401000714796283, + "grad_norm": 0.5474520921707153, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0514, + "step": 4900 + }, + { + "epoch": 0.14038598999285204, + "grad_norm": 0.9736361503601074, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0472, + "step": 4910 + }, + { + "epoch": 0.14067190850607578, + "grad_norm": 0.5566041469573975, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0533, + "step": 4920 + }, + { + "epoch": 0.1409578270192995, + "grad_norm": 1.0295166969299316, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0478, + "step": 4930 + }, + { + "epoch": 0.14124374553252322, + "grad_norm": 1.0931389331817627, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0652, + "step": 4940 + }, + { + "epoch": 0.14152966404574696, + "grad_norm": 1.3054399490356445, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0564, + "step": 4950 + }, + { + "epoch": 0.1418155825589707, + "grad_norm": 0.45592883229255676, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0456, + "step": 4960 + }, + { + "epoch": 0.14210150107219444, + "grad_norm": 0.6758268475532532, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0543, + "step": 4970 + }, + { + "epoch": 0.14238741958541815, + "grad_norm": 0.9643615484237671, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 0.14267333809864188, + "grad_norm": 0.565969705581665, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0458, + "step": 4990 + }, + { + "epoch": 0.14295925661186562, + "grad_norm": 0.8053064346313477, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0558, + "step": 5000 + }, + { + "epoch": 0.14324517512508936, + "grad_norm": 0.606215238571167, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0562, + "step": 5010 + }, + { + "epoch": 0.14353109363831307, + "grad_norm": 0.5565656423568726, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0543, + "step": 5020 + }, + { + "epoch": 0.1438170121515368, + "grad_norm": 0.353696346282959, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0451, + "step": 5030 + }, + { + "epoch": 0.14410293066476054, + "grad_norm": 0.6627641916275024, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0607, + "step": 5040 + }, + { + "epoch": 0.14438884917798428, + "grad_norm": 0.7896742224693298, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0496, + "step": 5050 + }, + { + "epoch": 0.14467476769120802, + "grad_norm": 0.7444631457328796, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0641, + "step": 5060 + }, + { + "epoch": 0.14496068620443173, + "grad_norm": 0.7871376872062683, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 0.14524660471765546, + "grad_norm": 0.7784642577171326, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0466, + "step": 5080 + }, + { + "epoch": 0.1455325232308792, + "grad_norm": 0.6950685381889343, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0457, + "step": 5090 + }, + { + "epoch": 0.14581844174410294, + "grad_norm": 1.0631619691848755, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0513, + "step": 5100 + }, + { + "epoch": 0.14610436025732665, + "grad_norm": 0.4327051639556885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0599, + "step": 5110 + }, + { + "epoch": 0.14639027877055039, + "grad_norm": 0.7790032029151917, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0617, + "step": 5120 + }, + { + "epoch": 0.14667619728377412, + "grad_norm": 0.42061591148376465, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.14696211579699786, + "grad_norm": 1.4090712070465088, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0564, + "step": 5140 + }, + { + "epoch": 0.1472480343102216, + "grad_norm": 0.540844738483429, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0505, + "step": 5150 + }, + { + "epoch": 0.1475339528234453, + "grad_norm": 0.5608566999435425, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0583, + "step": 5160 + }, + { + "epoch": 0.14781987133666905, + "grad_norm": 0.750708818435669, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0467, + "step": 5170 + }, + { + "epoch": 0.14810578984989278, + "grad_norm": 0.608989953994751, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0507, + "step": 5180 + }, + { + "epoch": 0.14839170836311652, + "grad_norm": 0.8176707029342651, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0455, + "step": 5190 + }, + { + "epoch": 0.14867762687634023, + "grad_norm": 0.5280511379241943, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0586, + "step": 5200 + }, + { + "epoch": 0.14896354538956397, + "grad_norm": 0.5914652347564697, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.042, + "step": 5210 + }, + { + "epoch": 0.1492494639027877, + "grad_norm": 0.4816238582134247, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0431, + "step": 5220 + }, + { + "epoch": 0.14953538241601144, + "grad_norm": 0.5413132309913635, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0453, + "step": 5230 + }, + { + "epoch": 0.14982130092923518, + "grad_norm": 0.749200701713562, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0505, + "step": 5240 + }, + { + "epoch": 0.1501072194424589, + "grad_norm": 0.8051598072052002, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.15039313795568263, + "grad_norm": 0.5365609526634216, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0383, + "step": 5260 + }, + { + "epoch": 0.15067905646890636, + "grad_norm": 0.5546812415122986, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0438, + "step": 5270 + }, + { + "epoch": 0.1509649749821301, + "grad_norm": 0.6248345375061035, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.045, + "step": 5280 + }, + { + "epoch": 0.1512508934953538, + "grad_norm": 0.42673179507255554, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0435, + "step": 5290 + }, + { + "epoch": 0.15153681200857755, + "grad_norm": 0.6677115559577942, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 0.15182273052180129, + "grad_norm": 0.4739227294921875, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0516, + "step": 5310 + }, + { + "epoch": 0.15210864903502502, + "grad_norm": 0.7931821346282959, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0566, + "step": 5320 + }, + { + "epoch": 0.15239456754824876, + "grad_norm": 0.6296460032463074, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0496, + "step": 5330 + }, + { + "epoch": 0.15268048606147247, + "grad_norm": 0.6713911890983582, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0462, + "step": 5340 + }, + { + "epoch": 0.1529664045746962, + "grad_norm": 1.088040828704834, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0663, + "step": 5350 + }, + { + "epoch": 0.15325232308791994, + "grad_norm": 1.4942265748977661, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0541, + "step": 5360 + }, + { + "epoch": 0.15353824160114368, + "grad_norm": 1.5721286535263062, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0546, + "step": 5370 + }, + { + "epoch": 0.1538241601143674, + "grad_norm": 0.9329798221588135, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0538, + "step": 5380 + }, + { + "epoch": 0.15411007862759113, + "grad_norm": 0.5658103823661804, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0561, + "step": 5390 + }, + { + "epoch": 0.15439599714081487, + "grad_norm": 0.6210218071937561, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.054, + "step": 5400 + }, + { + "epoch": 0.1546819156540386, + "grad_norm": 0.7934702634811401, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0506, + "step": 5410 + }, + { + "epoch": 0.15496783416726234, + "grad_norm": 1.0321810245513916, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0483, + "step": 5420 + }, + { + "epoch": 0.15525375268048605, + "grad_norm": 0.6226248145103455, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0464, + "step": 5430 + }, + { + "epoch": 0.1555396711937098, + "grad_norm": 0.6217877864837646, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0466, + "step": 5440 + }, + { + "epoch": 0.15582558970693353, + "grad_norm": 0.44068101048469543, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0517, + "step": 5450 + }, + { + "epoch": 0.15611150822015726, + "grad_norm": 0.4715922772884369, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0391, + "step": 5460 + }, + { + "epoch": 0.15639742673338097, + "grad_norm": 0.6649858951568604, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0524, + "step": 5470 + }, + { + "epoch": 0.1566833452466047, + "grad_norm": 0.5635918974876404, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.054, + "step": 5480 + }, + { + "epoch": 0.15696926375982845, + "grad_norm": 0.5584990978240967, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0559, + "step": 5490 + }, + { + "epoch": 0.15725518227305219, + "grad_norm": 0.7777124047279358, + "learning_rate": 1.895206504082557e-05, + "loss": 0.052, + "step": 5500 + }, + { + "epoch": 0.15754110078627592, + "grad_norm": 0.7057285308837891, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0507, + "step": 5510 + }, + { + "epoch": 0.15782701929949963, + "grad_norm": 0.4290146827697754, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0508, + "step": 5520 + }, + { + "epoch": 0.15811293781272337, + "grad_norm": 0.7333746552467346, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0609, + "step": 5530 + }, + { + "epoch": 0.1583988563259471, + "grad_norm": 0.6905514001846313, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0441, + "step": 5540 + }, + { + "epoch": 0.15868477483917084, + "grad_norm": 0.4859441816806793, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0586, + "step": 5550 + }, + { + "epoch": 0.15897069335239455, + "grad_norm": 0.4259501099586487, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0446, + "step": 5560 + }, + { + "epoch": 0.1592566118656183, + "grad_norm": 0.7659216523170471, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0486, + "step": 5570 + }, + { + "epoch": 0.15954253037884203, + "grad_norm": 0.6377918124198914, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0497, + "step": 5580 + }, + { + "epoch": 0.15982844889206577, + "grad_norm": 0.9122095704078674, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0497, + "step": 5590 + }, + { + "epoch": 0.1601143674052895, + "grad_norm": 0.5986319780349731, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0789, + "step": 5600 + }, + { + "epoch": 0.1604002859185132, + "grad_norm": 0.6486982107162476, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0481, + "step": 5610 + }, + { + "epoch": 0.16068620443173695, + "grad_norm": 0.9778286814689636, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0609, + "step": 5620 + }, + { + "epoch": 0.1609721229449607, + "grad_norm": 0.9133608341217041, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0499, + "step": 5630 + }, + { + "epoch": 0.16125804145818443, + "grad_norm": 0.8979085087776184, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0539, + "step": 5640 + }, + { + "epoch": 0.16154395997140814, + "grad_norm": 0.7787102460861206, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0593, + "step": 5650 + }, + { + "epoch": 0.16182987848463187, + "grad_norm": 0.8269296884536743, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0466, + "step": 5660 + }, + { + "epoch": 0.1621157969978556, + "grad_norm": 1.0018537044525146, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0542, + "step": 5670 + }, + { + "epoch": 0.16240171551107935, + "grad_norm": 0.6690066456794739, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0504, + "step": 5680 + }, + { + "epoch": 0.16268763402430308, + "grad_norm": 0.8186119198799133, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0523, + "step": 5690 + }, + { + "epoch": 0.1629735525375268, + "grad_norm": 0.6039218902587891, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.053, + "step": 5700 + }, + { + "epoch": 0.16325947105075053, + "grad_norm": 0.5570294857025146, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0627, + "step": 5710 + }, + { + "epoch": 0.16354538956397427, + "grad_norm": 0.6330029368400574, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.043, + "step": 5720 + }, + { + "epoch": 0.163831308077198, + "grad_norm": 0.42857953906059265, + "learning_rate": 1.884459101447439e-05, + "loss": 0.043, + "step": 5730 + }, + { + "epoch": 0.16411722659042172, + "grad_norm": 0.6611765027046204, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0478, + "step": 5740 + }, + { + "epoch": 0.16440314510364545, + "grad_norm": 0.5025321841239929, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0563, + "step": 5750 + }, + { + "epoch": 0.1646890636168692, + "grad_norm": 0.468772292137146, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0579, + "step": 5760 + }, + { + "epoch": 0.16497498213009293, + "grad_norm": 0.8914149403572083, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0451, + "step": 5770 + }, + { + "epoch": 0.16526090064331667, + "grad_norm": 0.7421362996101379, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0446, + "step": 5780 + }, + { + "epoch": 0.16554681915654038, + "grad_norm": 0.6159907579421997, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0486, + "step": 5790 + }, + { + "epoch": 0.1658327376697641, + "grad_norm": 0.7762402892112732, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0528, + "step": 5800 + }, + { + "epoch": 0.16611865618298785, + "grad_norm": 0.688562273979187, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0562, + "step": 5810 + }, + { + "epoch": 0.1664045746962116, + "grad_norm": 0.6233720183372498, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0454, + "step": 5820 + }, + { + "epoch": 0.1666904932094353, + "grad_norm": 1.0762931108474731, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0586, + "step": 5830 + }, + { + "epoch": 0.16697641172265903, + "grad_norm": 0.6782101988792419, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0486, + "step": 5840 + }, + { + "epoch": 0.16726233023588277, + "grad_norm": 0.8854986429214478, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0668, + "step": 5850 + }, + { + "epoch": 0.1675482487491065, + "grad_norm": 0.6537308096885681, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0456, + "step": 5860 + }, + { + "epoch": 0.16783416726233025, + "grad_norm": 1.4588080644607544, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0531, + "step": 5870 + }, + { + "epoch": 0.16812008577555396, + "grad_norm": 0.4888838529586792, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0608, + "step": 5880 + }, + { + "epoch": 0.1684060042887777, + "grad_norm": 0.6046859622001648, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0596, + "step": 5890 + }, + { + "epoch": 0.16869192280200143, + "grad_norm": 1.0373053550720215, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0672, + "step": 5900 + }, + { + "epoch": 0.16897784131522517, + "grad_norm": 0.7728743553161621, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0567, + "step": 5910 + }, + { + "epoch": 0.16926375982844888, + "grad_norm": 0.7804396152496338, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0443, + "step": 5920 + }, + { + "epoch": 0.16954967834167262, + "grad_norm": 0.5331568717956543, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0462, + "step": 5930 + }, + { + "epoch": 0.16983559685489635, + "grad_norm": 0.5623118877410889, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0549, + "step": 5940 + }, + { + "epoch": 0.1701215153681201, + "grad_norm": 0.5113009214401245, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.17040743388134383, + "grad_norm": 0.45996031165122986, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0474, + "step": 5960 + }, + { + "epoch": 0.17069335239456754, + "grad_norm": 0.9673702716827393, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0496, + "step": 5970 + }, + { + "epoch": 0.17097927090779128, + "grad_norm": 0.6134442687034607, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0472, + "step": 5980 + }, + { + "epoch": 0.171265189421015, + "grad_norm": 0.5929660797119141, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0448, + "step": 5990 + }, + { + "epoch": 0.17155110793423875, + "grad_norm": 0.6973591446876526, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0561, + "step": 6000 + }, + { + "epoch": 0.17183702644746246, + "grad_norm": 0.6361686587333679, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0528, + "step": 6010 + }, + { + "epoch": 0.1721229449606862, + "grad_norm": 0.8463344573974609, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0505, + "step": 6020 + }, + { + "epoch": 0.17240886347390993, + "grad_norm": 0.7931243777275085, + "learning_rate": 1.869709961183946e-05, + "loss": 0.047, + "step": 6030 + }, + { + "epoch": 0.17269478198713367, + "grad_norm": 0.8827673196792603, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0449, + "step": 6040 + }, + { + "epoch": 0.1729807005003574, + "grad_norm": 0.624167263507843, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0432, + "step": 6050 + }, + { + "epoch": 0.17326661901358112, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0546, + "step": 6060 + }, + { + "epoch": 0.17355253752680486, + "grad_norm": 0.6836652755737305, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0463, + "step": 6070 + }, + { + "epoch": 0.1738384560400286, + "grad_norm": 0.5454772114753723, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0554, + "step": 6080 + }, + { + "epoch": 0.17412437455325233, + "grad_norm": 0.3758164048194885, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0437, + "step": 6090 + }, + { + "epoch": 0.17441029306647604, + "grad_norm": 0.4269026517868042, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0436, + "step": 6100 + }, + { + "epoch": 0.17469621157969978, + "grad_norm": 1.3504232168197632, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0563, + "step": 6110 + }, + { + "epoch": 0.17498213009292352, + "grad_norm": 0.6270191669464111, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0552, + "step": 6120 + }, + { + "epoch": 0.17526804860614725, + "grad_norm": 0.7632624506950378, + "learning_rate": 1.864612143364565e-05, + "loss": 0.042, + "step": 6130 + }, + { + "epoch": 0.175553967119371, + "grad_norm": 0.7420883774757385, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0472, + "step": 6140 + }, + { + "epoch": 0.1758398856325947, + "grad_norm": 0.38518550992012024, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0494, + "step": 6150 + }, + { + "epoch": 0.17612580414581844, + "grad_norm": 0.4203122556209564, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.049, + "step": 6160 + }, + { + "epoch": 0.17641172265904217, + "grad_norm": 0.843169093132019, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0528, + "step": 6170 + }, + { + "epoch": 0.1766976411722659, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0441, + "step": 6180 + }, + { + "epoch": 0.17698355968548962, + "grad_norm": 0.9894040822982788, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0494, + "step": 6190 + }, + { + "epoch": 0.17726947819871336, + "grad_norm": 0.8269744515419006, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0533, + "step": 6200 + }, + { + "epoch": 0.1775553967119371, + "grad_norm": 0.7923200726509094, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0518, + "step": 6210 + }, + { + "epoch": 0.17784131522516083, + "grad_norm": 0.580436646938324, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 0.17812723373838457, + "grad_norm": 1.0633399486541748, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0528, + "step": 6230 + }, + { + "epoch": 0.17841315225160828, + "grad_norm": 0.925599217414856, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0552, + "step": 6240 + }, + { + "epoch": 0.17869907076483202, + "grad_norm": 0.5874597430229187, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0473, + "step": 6250 + }, + { + "epoch": 0.17898498927805576, + "grad_norm": 0.9065818190574646, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0504, + "step": 6260 + }, + { + "epoch": 0.1792709077912795, + "grad_norm": 0.9060930609703064, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0578, + "step": 6270 + }, + { + "epoch": 0.1795568263045032, + "grad_norm": 0.6221855878829956, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.17984274481772694, + "grad_norm": 0.589621901512146, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0452, + "step": 6290 + }, + { + "epoch": 0.18012866333095068, + "grad_norm": 0.4308580756187439, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0528, + "step": 6300 + }, + { + "epoch": 0.18041458184417442, + "grad_norm": 0.34031248092651367, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0544, + "step": 6310 + }, + { + "epoch": 0.18070050035739815, + "grad_norm": 0.6438931226730347, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0411, + "step": 6320 + }, + { + "epoch": 0.18098641887062186, + "grad_norm": 0.5436957478523254, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0381, + "step": 6330 + }, + { + "epoch": 0.1812723373838456, + "grad_norm": 0.7326043248176575, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0486, + "step": 6340 + }, + { + "epoch": 0.18155825589706934, + "grad_norm": 0.9194608330726624, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0455, + "step": 6350 + }, + { + "epoch": 0.18184417441029307, + "grad_norm": 0.9366886019706726, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0529, + "step": 6360 + }, + { + "epoch": 0.18213009292351678, + "grad_norm": 0.3178311586380005, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0455, + "step": 6370 + }, + { + "epoch": 0.18241601143674052, + "grad_norm": 0.9811000823974609, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.054, + "step": 6380 + }, + { + "epoch": 0.18270192994996426, + "grad_norm": 0.4635869562625885, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0466, + "step": 6390 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.6958444118499756, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0448, + "step": 6400 + }, + { + "epoch": 0.18327376697641173, + "grad_norm": 0.765814483165741, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0558, + "step": 6410 + }, + { + "epoch": 0.18355968548963544, + "grad_norm": 0.4117525815963745, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0484, + "step": 6420 + }, + { + "epoch": 0.18384560400285918, + "grad_norm": 0.6114997267723083, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0449, + "step": 6430 + }, + { + "epoch": 0.18413152251608292, + "grad_norm": 0.6006572842597961, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0442, + "step": 6440 + }, + { + "epoch": 0.18441744102930666, + "grad_norm": 0.5918669104576111, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0472, + "step": 6450 + }, + { + "epoch": 0.18470335954253037, + "grad_norm": 0.42107391357421875, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0471, + "step": 6460 + }, + { + "epoch": 0.1849892780557541, + "grad_norm": 0.5666350722312927, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0451, + "step": 6470 + }, + { + "epoch": 0.18527519656897784, + "grad_norm": 0.6074198484420776, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.051, + "step": 6480 + }, + { + "epoch": 0.18556111508220158, + "grad_norm": 0.771105945110321, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0402, + "step": 6490 + }, + { + "epoch": 0.18584703359542531, + "grad_norm": 0.6381934881210327, + "learning_rate": 1.844974808419918e-05, + "loss": 0.049, + "step": 6500 + }, + { + "epoch": 0.18613295210864902, + "grad_norm": 0.4039069712162018, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0477, + "step": 6510 + }, + { + "epoch": 0.18641887062187276, + "grad_norm": 0.8936404585838318, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0515, + "step": 6520 + }, + { + "epoch": 0.1867047891350965, + "grad_norm": 0.5358276963233948, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0397, + "step": 6530 + }, + { + "epoch": 0.18699070764832024, + "grad_norm": 0.7260947823524475, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0501, + "step": 6540 + }, + { + "epoch": 0.18727662616154395, + "grad_norm": 0.6378960609436035, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0575, + "step": 6550 + }, + { + "epoch": 0.18756254467476768, + "grad_norm": 0.5879429578781128, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.041, + "step": 6560 + }, + { + "epoch": 0.18784846318799142, + "grad_norm": 0.846297025680542, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0494, + "step": 6570 + }, + { + "epoch": 0.18813438170121516, + "grad_norm": 0.5211764574050903, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0463, + "step": 6580 + }, + { + "epoch": 0.1884203002144389, + "grad_norm": 0.8060504794120789, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0486, + "step": 6590 + }, + { + "epoch": 0.1887062187276626, + "grad_norm": 0.5741685628890991, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0435, + "step": 6600 + }, + { + "epoch": 0.18899213724088634, + "grad_norm": 0.6195408701896667, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0609, + "step": 6610 + }, + { + "epoch": 0.18927805575411008, + "grad_norm": 0.46843090653419495, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0477, + "step": 6620 + }, + { + "epoch": 0.18956397426733382, + "grad_norm": 0.5169982314109802, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0515, + "step": 6630 + }, + { + "epoch": 0.18984989278055753, + "grad_norm": 0.5571608543395996, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0492, + "step": 6640 + }, + { + "epoch": 0.19013581129378126, + "grad_norm": 0.7798209190368652, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0682, + "step": 6650 + }, + { + "epoch": 0.190421729807005, + "grad_norm": 0.6120383143424988, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0516, + "step": 6660 + }, + { + "epoch": 0.19070764832022874, + "grad_norm": 1.0191924571990967, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.049, + "step": 6670 + }, + { + "epoch": 0.19099356683345248, + "grad_norm": 0.5271646976470947, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0461, + "step": 6680 + }, + { + "epoch": 0.1912794853466762, + "grad_norm": 0.3315111994743347, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 0.19156540385989992, + "grad_norm": 0.7598944306373596, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0576, + "step": 6700 + }, + { + "epoch": 0.19185132237312366, + "grad_norm": 0.8039186596870422, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0489, + "step": 6710 + }, + { + "epoch": 0.1921372408863474, + "grad_norm": 0.911704957485199, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0508, + "step": 6720 + }, + { + "epoch": 0.1924231593995711, + "grad_norm": 0.6092261672019958, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0494, + "step": 6730 + }, + { + "epoch": 0.19270907791279485, + "grad_norm": 0.7890674471855164, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.049, + "step": 6740 + }, + { + "epoch": 0.19299499642601858, + "grad_norm": 0.8601320385932922, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0402, + "step": 6750 + }, + { + "epoch": 0.19328091493924232, + "grad_norm": 0.8750951290130615, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0517, + "step": 6760 + }, + { + "epoch": 0.19356683345246606, + "grad_norm": 0.7143217921257019, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0414, + "step": 6770 + }, + { + "epoch": 0.19385275196568977, + "grad_norm": 0.8340809345245361, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.1941386704789135, + "grad_norm": 0.4074079692363739, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0455, + "step": 6790 + }, + { + "epoch": 0.19442458899213724, + "grad_norm": 0.5369135737419128, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0472, + "step": 6800 + }, + { + "epoch": 0.19471050750536098, + "grad_norm": 0.44467195868492126, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 0.1949964260185847, + "grad_norm": 0.6032440662384033, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0479, + "step": 6820 + }, + { + "epoch": 0.19528234453180843, + "grad_norm": 0.4078349173069, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 0.19556826304503216, + "grad_norm": 0.49480268359184265, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0432, + "step": 6840 + }, + { + "epoch": 0.1958541815582559, + "grad_norm": 0.9844514727592468, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0479, + "step": 6850 + }, + { + "epoch": 0.19614010007147964, + "grad_norm": 1.1353951692581177, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0539, + "step": 6860 + }, + { + "epoch": 0.19642601858470335, + "grad_norm": 0.7535272836685181, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0572, + "step": 6870 + }, + { + "epoch": 0.1967119370979271, + "grad_norm": 0.4950162470340729, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0466, + "step": 6880 + }, + { + "epoch": 0.19699785561115082, + "grad_norm": 0.5310598015785217, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0487, + "step": 6890 + }, + { + "epoch": 0.19728377412437456, + "grad_norm": 0.9481188654899597, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0448, + "step": 6900 + }, + { + "epoch": 0.19756969263759827, + "grad_norm": 0.5303207039833069, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0419, + "step": 6910 + }, + { + "epoch": 0.197855611150822, + "grad_norm": 0.6180852055549622, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 0.19814152966404575, + "grad_norm": 0.5310384631156921, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0471, + "step": 6930 + }, + { + "epoch": 0.19842744817726948, + "grad_norm": 0.546660304069519, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0481, + "step": 6940 + }, + { + "epoch": 0.19871336669049322, + "grad_norm": 0.7824214696884155, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0423, + "step": 6950 + }, + { + "epoch": 0.19899928520371693, + "grad_norm": 0.9130761623382568, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0436, + "step": 6960 + }, + { + "epoch": 0.19928520371694067, + "grad_norm": 1.0512481927871704, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 0.1995711222301644, + "grad_norm": 0.8660218715667725, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0533, + "step": 6980 + }, + { + "epoch": 0.19985704074338814, + "grad_norm": 0.5280078649520874, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0455, + "step": 6990 + }, + { + "epoch": 0.20014295925661185, + "grad_norm": 0.6151753067970276, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0476, + "step": 7000 + }, + { + "epoch": 0.2004288777698356, + "grad_norm": 0.7165628671646118, + "learning_rate": 1.815952390818299e-05, + "loss": 0.051, + "step": 7010 + }, + { + "epoch": 0.20071479628305933, + "grad_norm": 0.6857513189315796, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0566, + "step": 7020 + }, + { + "epoch": 0.20100071479628306, + "grad_norm": 0.5589154958724976, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0741, + "step": 7030 + }, + { + "epoch": 0.2012866333095068, + "grad_norm": 0.6684713959693909, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0461, + "step": 7040 + }, + { + "epoch": 0.2015725518227305, + "grad_norm": 0.41142046451568604, + "learning_rate": 1.813582526827608e-05, + "loss": 0.043, + "step": 7050 + }, + { + "epoch": 0.20185847033595425, + "grad_norm": 0.29734253883361816, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0464, + "step": 7060 + }, + { + "epoch": 0.20214438884917799, + "grad_norm": 0.3914707899093628, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.052, + "step": 7070 + }, + { + "epoch": 0.20243030736240172, + "grad_norm": 0.5075880885124207, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0401, + "step": 7080 + }, + { + "epoch": 0.20271622587562543, + "grad_norm": 0.6182138919830322, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0428, + "step": 7090 + }, + { + "epoch": 0.20300214438884917, + "grad_norm": 1.0438663959503174, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0476, + "step": 7100 + }, + { + "epoch": 0.2032880629020729, + "grad_norm": 0.4646940529346466, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0419, + "step": 7110 + }, + { + "epoch": 0.20357398141529665, + "grad_norm": 0.4236893951892853, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0539, + "step": 7120 + }, + { + "epoch": 0.20385989992852038, + "grad_norm": 0.7975651025772095, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0459, + "step": 7130 + }, + { + "epoch": 0.2041458184417441, + "grad_norm": 0.9628227949142456, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0568, + "step": 7140 + }, + { + "epoch": 0.20443173695496783, + "grad_norm": 0.8878718614578247, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0528, + "step": 7150 + }, + { + "epoch": 0.20471765546819157, + "grad_norm": 0.5407359004020691, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0417, + "step": 7160 + }, + { + "epoch": 0.2050035739814153, + "grad_norm": 0.4407803416252136, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0435, + "step": 7170 + }, + { + "epoch": 0.20528949249463901, + "grad_norm": 0.4055456221103668, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0652, + "step": 7180 + }, + { + "epoch": 0.20557541100786275, + "grad_norm": 0.44706887006759644, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0476, + "step": 7190 + }, + { + "epoch": 0.2058613295210865, + "grad_norm": 1.2640881538391113, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0496, + "step": 7200 + }, + { + "epoch": 0.20614724803431023, + "grad_norm": 0.3773214817047119, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0455, + "step": 7210 + }, + { + "epoch": 0.20643316654753396, + "grad_norm": 0.6460191011428833, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0765, + "step": 7220 + }, + { + "epoch": 0.20671908506075767, + "grad_norm": 0.6048172116279602, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0506, + "step": 7230 + }, + { + "epoch": 0.2070050035739814, + "grad_norm": 0.38502392172813416, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0487, + "step": 7240 + }, + { + "epoch": 0.20729092208720515, + "grad_norm": 1.5727262496948242, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.20757684060042889, + "grad_norm": 0.3985368609428406, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0501, + "step": 7260 + }, + { + "epoch": 0.2078627591136526, + "grad_norm": 0.4519219994544983, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0542, + "step": 7270 + }, + { + "epoch": 0.20814867762687633, + "grad_norm": 0.6547327637672424, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0628, + "step": 7280 + }, + { + "epoch": 0.20843459614010007, + "grad_norm": 0.7864896655082703, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0521, + "step": 7290 + }, + { + "epoch": 0.2087205146533238, + "grad_norm": 0.6605416536331177, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0501, + "step": 7300 + }, + { + "epoch": 0.20900643316654754, + "grad_norm": 0.8260928988456726, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 0.20929235167977125, + "grad_norm": 0.7167025804519653, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0465, + "step": 7320 + }, + { + "epoch": 0.209578270192995, + "grad_norm": 0.6838316917419434, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0449, + "step": 7330 + }, + { + "epoch": 0.20986418870621873, + "grad_norm": 0.46520882844924927, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0441, + "step": 7340 + }, + { + "epoch": 0.21015010721944247, + "grad_norm": 0.680860698223114, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0498, + "step": 7350 + }, + { + "epoch": 0.21043602573266618, + "grad_norm": 0.6697542071342468, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0361, + "step": 7360 + }, + { + "epoch": 0.21072194424588991, + "grad_norm": 0.9322425127029419, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0561, + "step": 7370 + }, + { + "epoch": 0.21100786275911365, + "grad_norm": 0.7454982399940491, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0464, + "step": 7380 + }, + { + "epoch": 0.2112937812723374, + "grad_norm": 0.5052962899208069, + "learning_rate": 1.792902262617481e-05, + "loss": 0.042, + "step": 7390 + }, + { + "epoch": 0.21157969978556113, + "grad_norm": 0.622719407081604, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0392, + "step": 7400 + }, + { + "epoch": 0.21186561829878484, + "grad_norm": 0.8296751976013184, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0512, + "step": 7410 + }, + { + "epoch": 0.21215153681200857, + "grad_norm": 0.7341750860214233, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0407, + "step": 7420 + }, + { + "epoch": 0.2124374553252323, + "grad_norm": 0.8206498026847839, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0491, + "step": 7430 + }, + { + "epoch": 0.21272337383845605, + "grad_norm": 0.5625871419906616, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0405, + "step": 7440 + }, + { + "epoch": 0.21300929235167976, + "grad_norm": 0.600284218788147, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0464, + "step": 7450 + }, + { + "epoch": 0.2132952108649035, + "grad_norm": 1.0839911699295044, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0481, + "step": 7460 + }, + { + "epoch": 0.21358112937812723, + "grad_norm": 0.45663371682167053, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0478, + "step": 7470 + }, + { + "epoch": 0.21386704789135097, + "grad_norm": 0.9196961522102356, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0566, + "step": 7480 + }, + { + "epoch": 0.2141529664045747, + "grad_norm": 0.5013288855552673, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0442, + "step": 7490 + }, + { + "epoch": 0.21443888491779842, + "grad_norm": 0.6444706916809082, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0484, + "step": 7500 + }, + { + "epoch": 0.21472480343102215, + "grad_norm": 0.5789361000061035, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0536, + "step": 7510 + }, + { + "epoch": 0.2150107219442459, + "grad_norm": 0.7474827170372009, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0526, + "step": 7520 + }, + { + "epoch": 0.21529664045746963, + "grad_norm": 0.7054215669631958, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0538, + "step": 7530 + }, + { + "epoch": 0.21558255897069334, + "grad_norm": 0.9778858423233032, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0533, + "step": 7540 + }, + { + "epoch": 0.21586847748391708, + "grad_norm": 0.7189548015594482, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0479, + "step": 7550 + }, + { + "epoch": 0.2161543959971408, + "grad_norm": 0.8761522769927979, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0512, + "step": 7560 + }, + { + "epoch": 0.21644031451036455, + "grad_norm": 0.6686418652534485, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.06, + "step": 7570 + }, + { + "epoch": 0.2167262330235883, + "grad_norm": 0.6385156512260437, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0495, + "step": 7580 + }, + { + "epoch": 0.217012151536812, + "grad_norm": 0.4785522520542145, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0477, + "step": 7590 + }, + { + "epoch": 0.21729807005003574, + "grad_norm": 0.883179783821106, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0472, + "step": 7600 + }, + { + "epoch": 0.21758398856325947, + "grad_norm": 0.5431568026542664, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0383, + "step": 7610 + }, + { + "epoch": 0.2178699070764832, + "grad_norm": 0.7085764408111572, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0465, + "step": 7620 + }, + { + "epoch": 0.21815582558970692, + "grad_norm": 0.4877212643623352, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0517, + "step": 7630 + }, + { + "epoch": 0.21844174410293066, + "grad_norm": 0.6874392032623291, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0555, + "step": 7640 + }, + { + "epoch": 0.2187276626161544, + "grad_norm": 0.9611791372299194, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0415, + "step": 7650 + }, + { + "epoch": 0.21901358112937813, + "grad_norm": 0.3618314862251282, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0601, + "step": 7660 + }, + { + "epoch": 0.21929949964260187, + "grad_norm": 0.5366251468658447, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0383, + "step": 7670 + }, + { + "epoch": 0.21958541815582558, + "grad_norm": 0.6323129534721375, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0536, + "step": 7680 + }, + { + "epoch": 0.21987133666904932, + "grad_norm": 0.4621681571006775, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0442, + "step": 7690 + }, + { + "epoch": 0.22015725518227305, + "grad_norm": 0.9297679662704468, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0517, + "step": 7700 + }, + { + "epoch": 0.2204431736954968, + "grad_norm": 0.5950489640235901, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0468, + "step": 7710 + }, + { + "epoch": 0.2207290922087205, + "grad_norm": 0.30251142382621765, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0471, + "step": 7720 + }, + { + "epoch": 0.22101501072194424, + "grad_norm": 0.6247804760932922, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0488, + "step": 7730 + }, + { + "epoch": 0.22130092923516798, + "grad_norm": 0.7118366360664368, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0567, + "step": 7740 + }, + { + "epoch": 0.2215868477483917, + "grad_norm": 0.6265056133270264, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.06, + "step": 7750 + }, + { + "epoch": 0.22187276626161545, + "grad_norm": 0.7232056260108948, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0393, + "step": 7760 + }, + { + "epoch": 0.22215868477483916, + "grad_norm": 0.7981307506561279, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0518, + "step": 7770 + }, + { + "epoch": 0.2224446032880629, + "grad_norm": 0.4492819011211395, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0425, + "step": 7780 + }, + { + "epoch": 0.22273052180128664, + "grad_norm": 0.578440248966217, + "learning_rate": 1.767371389304538e-05, + "loss": 0.043, + "step": 7790 + }, + { + "epoch": 0.22301644031451037, + "grad_norm": 0.8093826174736023, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0571, + "step": 7800 + }, + { + "epoch": 0.22330235882773408, + "grad_norm": 0.864661455154419, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0429, + "step": 7810 + }, + { + "epoch": 0.22358827734095782, + "grad_norm": 0.50054532289505, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0404, + "step": 7820 + }, + { + "epoch": 0.22387419585418156, + "grad_norm": 0.5690511465072632, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0406, + "step": 7830 + }, + { + "epoch": 0.2241601143674053, + "grad_norm": 0.7075231671333313, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0478, + "step": 7840 + }, + { + "epoch": 0.22444603288062903, + "grad_norm": 0.6326742768287659, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.065, + "step": 7850 + }, + { + "epoch": 0.22473195139385274, + "grad_norm": 0.48305049538612366, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0456, + "step": 7860 + }, + { + "epoch": 0.22501786990707648, + "grad_norm": 0.6333707571029663, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.048, + "step": 7870 + }, + { + "epoch": 0.22530378842030022, + "grad_norm": 0.6568662524223328, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0422, + "step": 7880 + }, + { + "epoch": 0.22558970693352395, + "grad_norm": 0.6302695870399475, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0589, + "step": 7890 + }, + { + "epoch": 0.22587562544674766, + "grad_norm": 0.6373940110206604, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0504, + "step": 7900 + }, + { + "epoch": 0.2261615439599714, + "grad_norm": 0.7108445167541504, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0486, + "step": 7910 + }, + { + "epoch": 0.22644746247319514, + "grad_norm": 0.5274208784103394, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0693, + "step": 7920 + }, + { + "epoch": 0.22673338098641888, + "grad_norm": 0.4020678997039795, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0461, + "step": 7930 + }, + { + "epoch": 0.2270192994996426, + "grad_norm": 0.5584745407104492, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0376, + "step": 7940 + }, + { + "epoch": 0.22730521801286632, + "grad_norm": 0.6614044904708862, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0461, + "step": 7950 + }, + { + "epoch": 0.22759113652609006, + "grad_norm": 0.506636917591095, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0431, + "step": 7960 + }, + { + "epoch": 0.2278770550393138, + "grad_norm": 0.5168156027793884, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0404, + "step": 7970 + }, + { + "epoch": 0.22816297355253753, + "grad_norm": 0.552480161190033, + "learning_rate": 1.754802282200567e-05, + "loss": 0.0565, + "step": 7980 + }, + { + "epoch": 0.22844889206576124, + "grad_norm": 0.8191191554069519, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0556, + "step": 7990 + }, + { + "epoch": 0.22873481057898498, + "grad_norm": 0.7767695188522339, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0447, + "step": 8000 + }, + { + "epoch": 0.22902072909220872, + "grad_norm": 0.9050281047821045, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0611, + "step": 8010 + }, + { + "epoch": 0.22930664760543246, + "grad_norm": 0.7805314660072327, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0532, + "step": 8020 + }, + { + "epoch": 0.2295925661186562, + "grad_norm": 0.6055987477302551, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0436, + "step": 8030 + }, + { + "epoch": 0.2298784846318799, + "grad_norm": 1.1075741052627563, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.053, + "step": 8040 + }, + { + "epoch": 0.23016440314510364, + "grad_norm": 0.6283855438232422, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0494, + "step": 8050 + }, + { + "epoch": 0.23045032165832738, + "grad_norm": 0.44009697437286377, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.047, + "step": 8060 + }, + { + "epoch": 0.23073624017155112, + "grad_norm": 0.4920162856578827, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0437, + "step": 8070 + }, + { + "epoch": 0.23102215868477483, + "grad_norm": 0.9286724328994751, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0513, + "step": 8080 + }, + { + "epoch": 0.23130807719799856, + "grad_norm": 0.6595107913017273, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0465, + "step": 8090 + }, + { + "epoch": 0.2315939957112223, + "grad_norm": 0.4930933713912964, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0422, + "step": 8100 + }, + { + "epoch": 0.23187991422444604, + "grad_norm": 0.6741859316825867, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0419, + "step": 8110 + }, + { + "epoch": 0.23216583273766978, + "grad_norm": 0.8081800937652588, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0449, + "step": 8120 + }, + { + "epoch": 0.23245175125089348, + "grad_norm": 1.0258036851882935, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0613, + "step": 8130 + }, + { + "epoch": 0.23273766976411722, + "grad_norm": 0.5007345080375671, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0473, + "step": 8140 + }, + { + "epoch": 0.23302358827734096, + "grad_norm": 0.3931804895401001, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0495, + "step": 8150 + }, + { + "epoch": 0.2333095067905647, + "grad_norm": 0.5907166600227356, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0449, + "step": 8160 + }, + { + "epoch": 0.2335954253037884, + "grad_norm": 0.49229851365089417, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0524, + "step": 8170 + }, + { + "epoch": 0.23388134381701214, + "grad_norm": 0.8386240601539612, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0527, + "step": 8180 + }, + { + "epoch": 0.23416726233023588, + "grad_norm": 0.7806615829467773, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0529, + "step": 8190 + }, + { + "epoch": 0.23445318084345962, + "grad_norm": 0.5716270804405212, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0534, + "step": 8200 + }, + { + "epoch": 0.23473909935668336, + "grad_norm": 1.165761947631836, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0591, + "step": 8210 + }, + { + "epoch": 0.23502501786990707, + "grad_norm": 0.867967426776886, + "learning_rate": 1.738529690353544e-05, + "loss": 0.049, + "step": 8220 + }, + { + "epoch": 0.2353109363831308, + "grad_norm": 0.5809492468833923, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0434, + "step": 8230 + }, + { + "epoch": 0.23559685489635454, + "grad_norm": 0.8418740034103394, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0461, + "step": 8240 + }, + { + "epoch": 0.23588277340957828, + "grad_norm": 0.5811617374420166, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0443, + "step": 8250 + }, + { + "epoch": 0.236168691922802, + "grad_norm": 0.7699318528175354, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0549, + "step": 8260 + }, + { + "epoch": 0.23645461043602573, + "grad_norm": 0.6066992878913879, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0415, + "step": 8270 + }, + { + "epoch": 0.23674052894924946, + "grad_norm": 0.7775973677635193, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0619, + "step": 8280 + }, + { + "epoch": 0.2370264474624732, + "grad_norm": 0.8320962190628052, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.048, + "step": 8290 + }, + { + "epoch": 0.23731236597569694, + "grad_norm": 0.7203818559646606, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0594, + "step": 8300 + }, + { + "epoch": 0.23759828448892065, + "grad_norm": 0.7634598612785339, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0614, + "step": 8310 + }, + { + "epoch": 0.23788420300214438, + "grad_norm": 0.557575523853302, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 0.23817012151536812, + "grad_norm": 1.0139968395233154, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0457, + "step": 8330 + }, + { + "epoch": 0.23845604002859186, + "grad_norm": 0.5543113946914673, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.048, + "step": 8340 + }, + { + "epoch": 0.23874195854181557, + "grad_norm": 1.0122590065002441, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0509, + "step": 8350 + }, + { + "epoch": 0.2390278770550393, + "grad_norm": 0.8776134252548218, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0475, + "step": 8360 + }, + { + "epoch": 0.23931379556826304, + "grad_norm": 0.41230106353759766, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0467, + "step": 8370 + }, + { + "epoch": 0.23959971408148678, + "grad_norm": 0.5460986495018005, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0455, + "step": 8380 + }, + { + "epoch": 0.23988563259471052, + "grad_norm": 0.5896333456039429, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.051, + "step": 8390 + }, + { + "epoch": 0.24017155110793423, + "grad_norm": 0.536375105381012, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0432, + "step": 8400 + }, + { + "epoch": 0.24045746962115797, + "grad_norm": 0.7597050666809082, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0459, + "step": 8410 + }, + { + "epoch": 0.2407433881343817, + "grad_norm": 0.6669795513153076, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0584, + "step": 8420 + }, + { + "epoch": 0.24102930664760544, + "grad_norm": 0.3614502251148224, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.039, + "step": 8430 + }, + { + "epoch": 0.24131522516082915, + "grad_norm": 0.5618023872375488, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0394, + "step": 8440 + }, + { + "epoch": 0.2416011436740529, + "grad_norm": 0.5897185802459717, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0502, + "step": 8450 + }, + { + "epoch": 0.24188706218727662, + "grad_norm": 0.5622876882553101, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0382, + "step": 8460 + }, + { + "epoch": 0.24217298070050036, + "grad_norm": 0.5639696717262268, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0652, + "step": 8470 + }, + { + "epoch": 0.2424588992137241, + "grad_norm": 0.5686836242675781, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0609, + "step": 8480 + }, + { + "epoch": 0.2427448177269478, + "grad_norm": 0.7248222827911377, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0512, + "step": 8490 + }, + { + "epoch": 0.24303073624017155, + "grad_norm": 0.6157225370407104, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0449, + "step": 8500 + }, + { + "epoch": 0.24331665475339528, + "grad_norm": 1.1660966873168945, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0427, + "step": 8510 + }, + { + "epoch": 0.24360257326661902, + "grad_norm": 1.1242589950561523, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0514, + "step": 8520 + }, + { + "epoch": 0.24388849177984273, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0491, + "step": 8530 + }, + { + "epoch": 0.24417441029306647, + "grad_norm": 0.41474589705467224, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0427, + "step": 8540 + }, + { + "epoch": 0.2444603288062902, + "grad_norm": 0.42195969820022583, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0486, + "step": 8550 + }, + { + "epoch": 0.24474624731951394, + "grad_norm": 0.3914433717727661, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0411, + "step": 8560 + }, + { + "epoch": 0.24503216583273768, + "grad_norm": 0.7590876817703247, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0439, + "step": 8570 + }, + { + "epoch": 0.2453180843459614, + "grad_norm": 0.4362296164035797, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0466, + "step": 8580 + }, + { + "epoch": 0.24560400285918513, + "grad_norm": 0.467949241399765, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0502, + "step": 8590 + }, + { + "epoch": 0.24588992137240887, + "grad_norm": 0.4731729328632355, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0599, + "step": 8600 + }, + { + "epoch": 0.2461758398856326, + "grad_norm": 0.491644948720932, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0524, + "step": 8610 + }, + { + "epoch": 0.2464617583988563, + "grad_norm": 0.5254928469657898, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0469, + "step": 8620 + }, + { + "epoch": 0.24674767691208005, + "grad_norm": 0.5721238255500793, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0493, + "step": 8630 + }, + { + "epoch": 0.2470335954253038, + "grad_norm": 0.5806096792221069, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0391, + "step": 8640 + }, + { + "epoch": 0.24731951393852752, + "grad_norm": 0.6683222055435181, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.24760543245175126, + "grad_norm": 0.41728726029396057, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0411, + "step": 8660 + }, + { + "epoch": 0.24789135096497497, + "grad_norm": 0.6001113653182983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0413, + "step": 8670 + }, + { + "epoch": 0.2481772694781987, + "grad_norm": 0.43813610076904297, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0389, + "step": 8680 + }, + { + "epoch": 0.24846318799142245, + "grad_norm": 1.5533791780471802, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0597, + "step": 8690 + }, + { + "epoch": 0.24874910650464618, + "grad_norm": 1.175837755203247, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 0.2490350250178699, + "grad_norm": 0.4798300862312317, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0459, + "step": 8710 + }, + { + "epoch": 0.24932094353109363, + "grad_norm": 0.7334772944450378, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0432, + "step": 8720 + }, + { + "epoch": 0.24960686204431737, + "grad_norm": 0.9633310437202454, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.05, + "step": 8730 + }, + { + "epoch": 0.2498927805575411, + "grad_norm": 0.7353480458259583, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.25017869907076484, + "grad_norm": 0.5958748459815979, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0428, + "step": 8750 + }, + { + "epoch": 0.2504646175839886, + "grad_norm": 0.8538689613342285, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0498, + "step": 8760 + }, + { + "epoch": 0.2507505360972123, + "grad_norm": 0.606607973575592, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.251036454610436, + "grad_norm": 0.3999035060405731, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0714, + "step": 8780 + }, + { + "epoch": 0.25132237312365974, + "grad_norm": 0.807314932346344, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.057, + "step": 8790 + }, + { + "epoch": 0.2516082916368835, + "grad_norm": 0.5238217115402222, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0485, + "step": 8800 + }, + { + "epoch": 0.2518942101501072, + "grad_norm": 1.6465950012207031, + "learning_rate": 1.696714953556411e-05, + "loss": 0.056, + "step": 8810 + }, + { + "epoch": 0.25218012866333095, + "grad_norm": 0.6568214297294617, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0424, + "step": 8820 + }, + { + "epoch": 0.2524660471765547, + "grad_norm": 0.4695168137550354, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0456, + "step": 8830 + }, + { + "epoch": 0.2527519656897784, + "grad_norm": 0.5652263164520264, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0527, + "step": 8840 + }, + { + "epoch": 0.25303788420300216, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0441, + "step": 8850 + }, + { + "epoch": 0.2533238027162259, + "grad_norm": 0.8288971781730652, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0513, + "step": 8860 + }, + { + "epoch": 0.2536097212294496, + "grad_norm": 0.8606051802635193, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0416, + "step": 8870 + }, + { + "epoch": 0.2538956397426733, + "grad_norm": 0.7235842347145081, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0481, + "step": 8880 + }, + { + "epoch": 0.25418155825589706, + "grad_norm": 0.9602673053741455, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0465, + "step": 8890 + }, + { + "epoch": 0.2544674767691208, + "grad_norm": 0.6431217789649963, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0489, + "step": 8900 + }, + { + "epoch": 0.25475339528234453, + "grad_norm": 0.42215701937675476, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0376, + "step": 8910 + }, + { + "epoch": 0.25503931379556827, + "grad_norm": 0.5899976491928101, + "learning_rate": 1.688644181174108e-05, + "loss": 0.048, + "step": 8920 + }, + { + "epoch": 0.255325232308792, + "grad_norm": 0.9504411816596985, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.047, + "step": 8930 + }, + { + "epoch": 0.25561115082201574, + "grad_norm": 0.5808438062667847, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0535, + "step": 8940 + }, + { + "epoch": 0.2558970693352395, + "grad_norm": 0.3811270594596863, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0418, + "step": 8950 + }, + { + "epoch": 0.25618298784846316, + "grad_norm": 1.0257363319396973, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0548, + "step": 8960 + }, + { + "epoch": 0.2564689063616869, + "grad_norm": 0.7294469475746155, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0569, + "step": 8970 + }, + { + "epoch": 0.25675482487491064, + "grad_norm": 0.4967000484466553, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0488, + "step": 8980 + }, + { + "epoch": 0.2570407433881344, + "grad_norm": 0.9160422086715698, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0471, + "step": 8990 + }, + { + "epoch": 0.2573266619013581, + "grad_norm": 0.5125435590744019, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.25761258041458185, + "grad_norm": 0.5617201328277588, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0597, + "step": 9010 + }, + { + "epoch": 0.2578984989278056, + "grad_norm": 0.7771851420402527, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0485, + "step": 9020 + }, + { + "epoch": 0.2581844174410293, + "grad_norm": 0.8434289693832397, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0429, + "step": 9030 + }, + { + "epoch": 0.25847033595425306, + "grad_norm": 0.513541042804718, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0488, + "step": 9040 + }, + { + "epoch": 0.25875625446747674, + "grad_norm": 1.0142096281051636, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0517, + "step": 9050 + }, + { + "epoch": 0.2590421729807005, + "grad_norm": 0.6343669295310974, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.049, + "step": 9060 + }, + { + "epoch": 0.2593280914939242, + "grad_norm": 0.33996936678886414, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.037, + "step": 9070 + }, + { + "epoch": 0.25961401000714796, + "grad_norm": 0.5964446663856506, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 0.2598999285203717, + "grad_norm": 0.4989728629589081, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0463, + "step": 9090 + }, + { + "epoch": 0.26018584703359543, + "grad_norm": 0.7735986113548279, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0576, + "step": 9100 + }, + { + "epoch": 0.26047176554681917, + "grad_norm": 1.2520418167114258, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0577, + "step": 9110 + }, + { + "epoch": 0.2607576840600429, + "grad_norm": 0.45247936248779297, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0458, + "step": 9120 + }, + { + "epoch": 0.26104360257326664, + "grad_norm": 0.8944823145866394, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0498, + "step": 9130 + }, + { + "epoch": 0.2613295210864903, + "grad_norm": 0.8308315277099609, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0545, + "step": 9140 + }, + { + "epoch": 0.26161543959971406, + "grad_norm": 0.6838778853416443, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 0.2619013581129378, + "grad_norm": 1.5998408794403076, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0591, + "step": 9160 + }, + { + "epoch": 0.26218727662616154, + "grad_norm": 0.8548596501350403, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.04, + "step": 9170 + }, + { + "epoch": 0.2624731951393853, + "grad_norm": 0.5784913897514343, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0464, + "step": 9180 + }, + { + "epoch": 0.262759113652609, + "grad_norm": 1.490502953529358, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0672, + "step": 9190 + }, + { + "epoch": 0.26304503216583275, + "grad_norm": 0.8950793743133545, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0532, + "step": 9200 + }, + { + "epoch": 0.2633309506790565, + "grad_norm": 0.5513611435890198, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 0.2636168691922802, + "grad_norm": 1.0512864589691162, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0487, + "step": 9220 + }, + { + "epoch": 0.2639027877055039, + "grad_norm": 0.48180028796195984, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0543, + "step": 9230 + }, + { + "epoch": 0.26418870621872764, + "grad_norm": 0.5451590418815613, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0553, + "step": 9240 + }, + { + "epoch": 0.2644746247319514, + "grad_norm": 0.6986148953437805, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0523, + "step": 9250 + }, + { + "epoch": 0.2647605432451751, + "grad_norm": 0.5977929830551147, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0446, + "step": 9260 + }, + { + "epoch": 0.26504646175839885, + "grad_norm": 0.6042361855506897, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0716, + "step": 9270 + }, + { + "epoch": 0.2653323802716226, + "grad_norm": 0.473418265581131, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0378, + "step": 9280 + }, + { + "epoch": 0.26561829878484633, + "grad_norm": 0.9332809448242188, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0484, + "step": 9290 + }, + { + "epoch": 0.26590421729807007, + "grad_norm": 0.5209246277809143, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0439, + "step": 9300 + }, + { + "epoch": 0.2661901358112938, + "grad_norm": 0.5742560625076294, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0468, + "step": 9310 + }, + { + "epoch": 0.2664760543245175, + "grad_norm": 0.585503876209259, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0507, + "step": 9320 + }, + { + "epoch": 0.2667619728377412, + "grad_norm": 0.5254957675933838, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0436, + "step": 9330 + }, + { + "epoch": 0.26704789135096496, + "grad_norm": 0.48314452171325684, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0417, + "step": 9340 + }, + { + "epoch": 0.2673338098641887, + "grad_norm": 0.630020022392273, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0425, + "step": 9350 + }, + { + "epoch": 0.26761972837741244, + "grad_norm": 0.3545299470424652, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0338, + "step": 9360 + }, + { + "epoch": 0.2679056468906362, + "grad_norm": 0.6934211850166321, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0445, + "step": 9370 + }, + { + "epoch": 0.2681915654038599, + "grad_norm": 0.6544952392578125, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0451, + "step": 9380 + }, + { + "epoch": 0.26847748391708365, + "grad_norm": 0.4581946134567261, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0422, + "step": 9390 + }, + { + "epoch": 0.2687634024303074, + "grad_norm": 0.6338506937026978, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0576, + "step": 9400 + }, + { + "epoch": 0.26904932094353107, + "grad_norm": 0.8165014386177063, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0474, + "step": 9410 + }, + { + "epoch": 0.2693352394567548, + "grad_norm": 0.793222188949585, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0546, + "step": 9420 + }, + { + "epoch": 0.26962115796997854, + "grad_norm": 0.3669852316379547, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0461, + "step": 9430 + }, + { + "epoch": 0.2699070764832023, + "grad_norm": 0.7339810729026794, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0433, + "step": 9440 + }, + { + "epoch": 0.270192994996426, + "grad_norm": 0.4948982298374176, + "learning_rate": 1.648606940465527e-05, + "loss": 0.048, + "step": 9450 + }, + { + "epoch": 0.27047891350964975, + "grad_norm": 0.4681016206741333, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0437, + "step": 9460 + }, + { + "epoch": 0.2707648320228735, + "grad_norm": 0.5091472864151001, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0576, + "step": 9470 + }, + { + "epoch": 0.27105075053609723, + "grad_norm": 0.5683515071868896, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0503, + "step": 9480 + }, + { + "epoch": 0.27133666904932097, + "grad_norm": 0.626844048500061, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0495, + "step": 9490 + }, + { + "epoch": 0.27162258756254465, + "grad_norm": 0.6757943034172058, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0495, + "step": 9500 + }, + { + "epoch": 0.2719085060757684, + "grad_norm": 0.7049196362495422, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0579, + "step": 9510 + }, + { + "epoch": 0.2721944245889921, + "grad_norm": 0.6469181776046753, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.051, + "step": 9520 + }, + { + "epoch": 0.27248034310221586, + "grad_norm": 0.5414942502975464, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0433, + "step": 9530 + }, + { + "epoch": 0.2727662616154396, + "grad_norm": 0.5642798542976379, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0495, + "step": 9540 + }, + { + "epoch": 0.27305218012866334, + "grad_norm": 1.0527595281600952, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0445, + "step": 9550 + }, + { + "epoch": 0.2733380986418871, + "grad_norm": 0.8501784801483154, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0627, + "step": 9560 + }, + { + "epoch": 0.2736240171551108, + "grad_norm": 0.7892033457756042, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 0.27390993566833455, + "grad_norm": 0.3588624596595764, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0512, + "step": 9580 + }, + { + "epoch": 0.27419585418155823, + "grad_norm": 0.7474772930145264, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.6217718124389648, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0435, + "step": 9600 + }, + { + "epoch": 0.2747676912080057, + "grad_norm": 0.7711623907089233, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.054, + "step": 9610 + }, + { + "epoch": 0.27505360972122944, + "grad_norm": 0.8171371221542358, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0371, + "step": 9620 + }, + { + "epoch": 0.2753395282344532, + "grad_norm": 0.8668338060379028, + "learning_rate": 1.634591312387623e-05, + "loss": 0.055, + "step": 9630 + }, + { + "epoch": 0.2756254467476769, + "grad_norm": 0.5683940052986145, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0478, + "step": 9640 + }, + { + "epoch": 0.27591136526090065, + "grad_norm": 0.44098007678985596, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0531, + "step": 9650 + }, + { + "epoch": 0.2761972837741244, + "grad_norm": 0.8305087685585022, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0462, + "step": 9660 + }, + { + "epoch": 0.27648320228734813, + "grad_norm": 0.9088799953460693, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0489, + "step": 9670 + }, + { + "epoch": 0.2767691208005718, + "grad_norm": 0.5590132474899292, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0486, + "step": 9680 + }, + { + "epoch": 0.27705503931379555, + "grad_norm": 0.776713490486145, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0443, + "step": 9690 + }, + { + "epoch": 0.2773409578270193, + "grad_norm": 0.6107578873634338, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0461, + "step": 9700 + }, + { + "epoch": 0.277626876340243, + "grad_norm": 0.4635901153087616, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0397, + "step": 9710 + }, + { + "epoch": 0.27791279485346676, + "grad_norm": 0.4220955967903137, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0463, + "step": 9720 + }, + { + "epoch": 0.2781987133666905, + "grad_norm": 0.4947739243507385, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0397, + "step": 9730 + }, + { + "epoch": 0.27848463187991424, + "grad_norm": 0.5589033961296082, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0426, + "step": 9740 + }, + { + "epoch": 0.278770550393138, + "grad_norm": 0.4904254972934723, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0458, + "step": 9750 + }, + { + "epoch": 0.2790564689063617, + "grad_norm": 0.34956127405166626, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.2793423874195854, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0424, + "step": 9770 + }, + { + "epoch": 0.27962830593280913, + "grad_norm": 0.48727869987487793, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0451, + "step": 9780 + }, + { + "epoch": 0.27991422444603287, + "grad_norm": 0.7314761281013489, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0523, + "step": 9790 + }, + { + "epoch": 0.2802001429592566, + "grad_norm": 0.5017405152320862, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0423, + "step": 9800 + }, + { + "epoch": 0.28048606147248034, + "grad_norm": 0.8375383615493774, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0435, + "step": 9810 + }, + { + "epoch": 0.2807719799857041, + "grad_norm": 0.8702818155288696, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0487, + "step": 9820 + }, + { + "epoch": 0.2810578984989278, + "grad_norm": 0.4649866223335266, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0483, + "step": 9830 + }, + { + "epoch": 0.28134381701215155, + "grad_norm": 0.7464607357978821, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0463, + "step": 9840 + }, + { + "epoch": 0.2816297355253753, + "grad_norm": 0.48055607080459595, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0418, + "step": 9850 + }, + { + "epoch": 0.281915654038599, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0433, + "step": 9860 + }, + { + "epoch": 0.2822015725518227, + "grad_norm": 0.8859265446662903, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0605, + "step": 9870 + }, + { + "epoch": 0.28248749106504645, + "grad_norm": 0.8236640691757202, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0441, + "step": 9880 + }, + { + "epoch": 0.2827734095782702, + "grad_norm": 0.6617199778556824, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0515, + "step": 9890 + }, + { + "epoch": 0.2830593280914939, + "grad_norm": 0.8017821907997131, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0503, + "step": 9900 + }, + { + "epoch": 0.28334524660471766, + "grad_norm": 1.070827603340149, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0485, + "step": 9910 + }, + { + "epoch": 0.2836311651179414, + "grad_norm": 1.021888256072998, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0479, + "step": 9920 + }, + { + "epoch": 0.28391708363116513, + "grad_norm": 0.34402501583099365, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0391, + "step": 9930 + }, + { + "epoch": 0.28420300214438887, + "grad_norm": 0.58541339635849, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0461, + "step": 9940 + }, + { + "epoch": 0.28448892065761255, + "grad_norm": 0.8062207102775574, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0553, + "step": 9950 + }, + { + "epoch": 0.2847748391708363, + "grad_norm": 0.6435661315917969, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0536, + "step": 9960 + }, + { + "epoch": 0.28506075768406003, + "grad_norm": 0.5670832395553589, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0405, + "step": 9970 + }, + { + "epoch": 0.28534667619728377, + "grad_norm": 0.45282548666000366, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0458, + "step": 9980 + }, + { + "epoch": 0.2856325947105075, + "grad_norm": 0.42272916436195374, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0392, + "step": 9990 + }, + { + "epoch": 0.28591851322373124, + "grad_norm": 0.5791928768157959, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0453, + "step": 10000 + }, + { + "epoch": 0.286204431736955, + "grad_norm": 0.9841408729553223, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.052, + "step": 10010 + }, + { + "epoch": 0.2864903502501787, + "grad_norm": 0.8658338785171509, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0461, + "step": 10020 + }, + { + "epoch": 0.28677626876340245, + "grad_norm": 0.624788224697113, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0416, + "step": 10030 + }, + { + "epoch": 0.28706218727662614, + "grad_norm": 0.6108028888702393, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0548, + "step": 10040 + }, + { + "epoch": 0.2873481057898499, + "grad_norm": 0.7907708883285522, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0406, + "step": 10050 + }, + { + "epoch": 0.2876340243030736, + "grad_norm": 0.7695413827896118, + "learning_rate": 1.60029690609047e-05, + "loss": 0.061, + "step": 10060 + }, + { + "epoch": 0.28791994281629735, + "grad_norm": 0.4407683312892914, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0483, + "step": 10070 + }, + { + "epoch": 0.2882058613295211, + "grad_norm": 0.6242743730545044, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.039, + "step": 10080 + }, + { + "epoch": 0.2884917798427448, + "grad_norm": 0.8752113580703735, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0433, + "step": 10090 + }, + { + "epoch": 0.28877769835596856, + "grad_norm": 0.8834511041641235, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0348, + "step": 10100 + }, + { + "epoch": 0.2890636168691923, + "grad_norm": 1.0036063194274902, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0593, + "step": 10110 + }, + { + "epoch": 0.28934953538241603, + "grad_norm": 0.5511205196380615, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0459, + "step": 10120 + }, + { + "epoch": 0.2896354538956397, + "grad_norm": 0.7717337012290955, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0381, + "step": 10130 + }, + { + "epoch": 0.28992137240886345, + "grad_norm": 1.123363971710205, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0462, + "step": 10140 + }, + { + "epoch": 0.2902072909220872, + "grad_norm": 0.6212007403373718, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0446, + "step": 10150 + }, + { + "epoch": 0.29049320943531093, + "grad_norm": 0.5547964572906494, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0362, + "step": 10160 + }, + { + "epoch": 0.29077912794853467, + "grad_norm": 0.593225359916687, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0462, + "step": 10170 + }, + { + "epoch": 0.2910650464617584, + "grad_norm": 0.5569560527801514, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0508, + "step": 10180 + }, + { + "epoch": 0.29135096497498214, + "grad_norm": 0.5464656949043274, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0399, + "step": 10190 + }, + { + "epoch": 0.2916368834882059, + "grad_norm": 1.2456778287887573, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0494, + "step": 10200 + }, + { + "epoch": 0.2919228020014296, + "grad_norm": 0.7862445712089539, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0551, + "step": 10210 + }, + { + "epoch": 0.2922087205146533, + "grad_norm": 0.745941698551178, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0469, + "step": 10220 + }, + { + "epoch": 0.29249463902787703, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0492, + "step": 10230 + }, + { + "epoch": 0.29278055754110077, + "grad_norm": 0.659205973148346, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0453, + "step": 10240 + }, + { + "epoch": 0.2930664760543245, + "grad_norm": 0.6925905346870422, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0463, + "step": 10250 + }, + { + "epoch": 0.29335239456754825, + "grad_norm": 0.479115754365921, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0395, + "step": 10260 + }, + { + "epoch": 0.293638313080772, + "grad_norm": 0.5085121393203735, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0504, + "step": 10270 + }, + { + "epoch": 0.2939242315939957, + "grad_norm": 0.46833914518356323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0411, + "step": 10280 + }, + { + "epoch": 0.29421015010721946, + "grad_norm": 0.4534672796726227, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0491, + "step": 10290 + }, + { + "epoch": 0.2944960686204432, + "grad_norm": 0.5704737305641174, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0391, + "step": 10300 + }, + { + "epoch": 0.2947819871336669, + "grad_norm": 1.0342676639556885, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0681, + "step": 10310 + }, + { + "epoch": 0.2950679056468906, + "grad_norm": 0.5002169013023376, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0429, + "step": 10320 + }, + { + "epoch": 0.29535382416011435, + "grad_norm": 0.5565863847732544, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0575, + "step": 10330 + }, + { + "epoch": 0.2956397426733381, + "grad_norm": 0.7826551198959351, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0448, + "step": 10340 + }, + { + "epoch": 0.29592566118656183, + "grad_norm": 0.7019012570381165, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0436, + "step": 10350 + }, + { + "epoch": 0.29621157969978557, + "grad_norm": 0.8324534893035889, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0503, + "step": 10360 + }, + { + "epoch": 0.2964974982130093, + "grad_norm": 0.7064073085784912, + "learning_rate": 1.574895332125391e-05, + "loss": 0.041, + "step": 10370 + }, + { + "epoch": 0.29678341672623304, + "grad_norm": 0.5634047389030457, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0474, + "step": 10380 + }, + { + "epoch": 0.2970693352394568, + "grad_norm": 0.8504926562309265, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0502, + "step": 10390 + }, + { + "epoch": 0.29735525375268046, + "grad_norm": 0.508313775062561, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0368, + "step": 10400 + }, + { + "epoch": 0.2976411722659042, + "grad_norm": 0.5851112008094788, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0472, + "step": 10410 + }, + { + "epoch": 0.29792709077912793, + "grad_norm": 0.5689557790756226, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0479, + "step": 10420 + }, + { + "epoch": 0.29821300929235167, + "grad_norm": 0.5026743412017822, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0406, + "step": 10430 + }, + { + "epoch": 0.2984989278055754, + "grad_norm": 0.5662751197814941, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0441, + "step": 10440 + }, + { + "epoch": 0.29878484631879915, + "grad_norm": 0.899709939956665, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.2990707648320229, + "grad_norm": 0.4681940972805023, + "learning_rate": 1.567419089313346e-05, + "loss": 0.054, + "step": 10460 + }, + { + "epoch": 0.2993566833452466, + "grad_norm": 0.39646071195602417, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0375, + "step": 10470 + }, + { + "epoch": 0.29964260185847036, + "grad_norm": 1.204815149307251, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0487, + "step": 10480 + }, + { + "epoch": 0.29992852037169404, + "grad_norm": 0.4507630467414856, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0516, + "step": 10490 + }, + { + "epoch": 0.3002144388849178, + "grad_norm": 0.9783321022987366, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0642, + "step": 10500 + }, + { + "epoch": 0.3005003573981415, + "grad_norm": 0.5406969785690308, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0447, + "step": 10510 + }, + { + "epoch": 0.30078627591136525, + "grad_norm": 0.44153860211372375, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0449, + "step": 10520 + }, + { + "epoch": 0.301072194424589, + "grad_norm": 0.5723687410354614, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0548, + "step": 10530 + }, + { + "epoch": 0.3013581129378127, + "grad_norm": 0.4453120529651642, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0434, + "step": 10540 + }, + { + "epoch": 0.30164403145103647, + "grad_norm": 0.34224697947502136, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0385, + "step": 10550 + }, + { + "epoch": 0.3019299499642602, + "grad_norm": 0.6389157176017761, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0569, + "step": 10560 + }, + { + "epoch": 0.30221586847748394, + "grad_norm": 0.5845953822135925, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0467, + "step": 10570 + }, + { + "epoch": 0.3025017869907076, + "grad_norm": 0.6581900119781494, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0422, + "step": 10580 + }, + { + "epoch": 0.30278770550393136, + "grad_norm": 0.4964161813259125, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0428, + "step": 10590 + }, + { + "epoch": 0.3030736240171551, + "grad_norm": 0.635380208492279, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0442, + "step": 10600 + }, + { + "epoch": 0.30335954253037883, + "grad_norm": 0.9795969128608704, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0517, + "step": 10610 + }, + { + "epoch": 0.30364546104360257, + "grad_norm": 0.9987231492996216, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0514, + "step": 10620 + }, + { + "epoch": 0.3039313795568263, + "grad_norm": 0.6384946703910828, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0471, + "step": 10630 + }, + { + "epoch": 0.30421729807005005, + "grad_norm": 0.49352115392684937, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0351, + "step": 10640 + }, + { + "epoch": 0.3045032165832738, + "grad_norm": 0.45028480887413025, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0438, + "step": 10650 + }, + { + "epoch": 0.3047891350964975, + "grad_norm": 0.5717794895172119, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0491, + "step": 10660 + }, + { + "epoch": 0.3050750536097212, + "grad_norm": 0.5436326265335083, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0407, + "step": 10670 + }, + { + "epoch": 0.30536097212294494, + "grad_norm": 0.7777692675590515, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0436, + "step": 10680 + }, + { + "epoch": 0.3056468906361687, + "grad_norm": 0.6597929000854492, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0407, + "step": 10690 + }, + { + "epoch": 0.3059328091493924, + "grad_norm": 0.6059311032295227, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0481, + "step": 10700 + }, + { + "epoch": 0.30621872766261615, + "grad_norm": 0.5530681014060974, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0418, + "step": 10710 + }, + { + "epoch": 0.3065046461758399, + "grad_norm": 0.5778716802597046, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0429, + "step": 10720 + }, + { + "epoch": 0.3067905646890636, + "grad_norm": 0.4573792517185211, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0586, + "step": 10730 + }, + { + "epoch": 0.30707648320228736, + "grad_norm": 0.8193615078926086, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0474, + "step": 10740 + }, + { + "epoch": 0.3073624017155111, + "grad_norm": 0.9410123229026794, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.3076483202287348, + "grad_norm": 0.8244432806968689, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0462, + "step": 10760 + }, + { + "epoch": 0.3079342387419585, + "grad_norm": 0.644899845123291, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0479, + "step": 10770 + }, + { + "epoch": 0.30822015725518226, + "grad_norm": 0.28044867515563965, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.04, + "step": 10780 + }, + { + "epoch": 0.308506075768406, + "grad_norm": 0.6538394093513489, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0406, + "step": 10790 + }, + { + "epoch": 0.30879199428162973, + "grad_norm": 0.9572822451591492, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0505, + "step": 10800 + }, + { + "epoch": 0.30907791279485347, + "grad_norm": 0.539826512336731, + "learning_rate": 1.537928999540189e-05, + "loss": 0.05, + "step": 10810 + }, + { + "epoch": 0.3093638313080772, + "grad_norm": 0.801988959312439, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0454, + "step": 10820 + }, + { + "epoch": 0.30964974982130095, + "grad_norm": 0.57478928565979, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.039, + "step": 10830 + }, + { + "epoch": 0.3099356683345247, + "grad_norm": 0.6313017010688782, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0384, + "step": 10840 + }, + { + "epoch": 0.31022158684774837, + "grad_norm": 0.507997989654541, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0365, + "step": 10850 + }, + { + "epoch": 0.3105075053609721, + "grad_norm": 0.5152313709259033, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0487, + "step": 10860 + }, + { + "epoch": 0.31079342387419584, + "grad_norm": 0.6123478412628174, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0405, + "step": 10870 + }, + { + "epoch": 0.3110793423874196, + "grad_norm": 1.079551100730896, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0443, + "step": 10880 + }, + { + "epoch": 0.3113652609006433, + "grad_norm": 0.39866960048675537, + "learning_rate": 1.531098472380285e-05, + "loss": 0.04, + "step": 10890 + }, + { + "epoch": 0.31165117941386705, + "grad_norm": 0.3715427815914154, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0387, + "step": 10900 + }, + { + "epoch": 0.3119370979270908, + "grad_norm": 0.7201068997383118, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.054, + "step": 10910 + }, + { + "epoch": 0.3122230164403145, + "grad_norm": 0.9512631893157959, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0383, + "step": 10920 + }, + { + "epoch": 0.31250893495353826, + "grad_norm": 0.5948206186294556, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0472, + "step": 10930 + }, + { + "epoch": 0.31279485346676195, + "grad_norm": 0.7174249291419983, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0437, + "step": 10940 + }, + { + "epoch": 0.3130807719799857, + "grad_norm": 0.6190982460975647, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0383, + "step": 10950 + }, + { + "epoch": 0.3133666904932094, + "grad_norm": 0.7733815312385559, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0327, + "step": 10960 + }, + { + "epoch": 0.31365260900643316, + "grad_norm": 1.2995271682739258, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0427, + "step": 10970 + }, + { + "epoch": 0.3139385275196569, + "grad_norm": 1.1102336645126343, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.04, + "step": 10980 + }, + { + "epoch": 0.31422444603288063, + "grad_norm": 0.7618277668952942, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0488, + "step": 10990 + }, + { + "epoch": 0.31451036454610437, + "grad_norm": 0.5355142951011658, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0436, + "step": 11000 + }, + { + "epoch": 0.3147962830593281, + "grad_norm": 1.3410072326660156, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0463, + "step": 11010 + }, + { + "epoch": 0.31508220157255185, + "grad_norm": 0.7810450196266174, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0493, + "step": 11020 + }, + { + "epoch": 0.3153681200857755, + "grad_norm": 0.6452206373214722, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0354, + "step": 11030 + }, + { + "epoch": 0.31565403859899926, + "grad_norm": 1.037593126296997, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0418, + "step": 11040 + }, + { + "epoch": 0.315939957112223, + "grad_norm": 0.7032834887504578, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0431, + "step": 11050 + }, + { + "epoch": 0.31622587562544674, + "grad_norm": 0.5168939232826233, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0472, + "step": 11060 + }, + { + "epoch": 0.3165117941386705, + "grad_norm": 0.5239925384521484, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0438, + "step": 11070 + }, + { + "epoch": 0.3167977126518942, + "grad_norm": 0.8209654688835144, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0506, + "step": 11080 + }, + { + "epoch": 0.31708363116511795, + "grad_norm": 0.5318232178688049, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0516, + "step": 11090 + }, + { + "epoch": 0.3173695496783417, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0482, + "step": 11100 + }, + { + "epoch": 0.3176554681915654, + "grad_norm": 0.6691215634346008, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.046, + "step": 11110 + }, + { + "epoch": 0.3179413867047891, + "grad_norm": 0.4862753450870514, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 0.31822730521801285, + "grad_norm": 0.4640316963195801, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0433, + "step": 11130 + }, + { + "epoch": 0.3185132237312366, + "grad_norm": 0.7841521501541138, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0445, + "step": 11140 + }, + { + "epoch": 0.3187991422444603, + "grad_norm": 0.6809426546096802, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0518, + "step": 11150 + }, + { + "epoch": 0.31908506075768406, + "grad_norm": 0.6195946931838989, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0569, + "step": 11160 + }, + { + "epoch": 0.3193709792709078, + "grad_norm": 0.7289860248565674, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0487, + "step": 11170 + }, + { + "epoch": 0.31965689778413153, + "grad_norm": 0.5575736165046692, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0409, + "step": 11180 + }, + { + "epoch": 0.31994281629735527, + "grad_norm": 0.8619267344474792, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0424, + "step": 11190 + }, + { + "epoch": 0.320228734810579, + "grad_norm": 0.740242063999176, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0474, + "step": 11200 + }, + { + "epoch": 0.3205146533238027, + "grad_norm": 0.4169894754886627, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0395, + "step": 11210 + }, + { + "epoch": 0.3208005718370264, + "grad_norm": 0.5773794651031494, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0414, + "step": 11220 + }, + { + "epoch": 0.32108649035025016, + "grad_norm": 0.4941500723361969, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0484, + "step": 11230 + }, + { + "epoch": 0.3213724088634739, + "grad_norm": 0.7985579371452332, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.051, + "step": 11240 + }, + { + "epoch": 0.32165832737669764, + "grad_norm": 0.5262066721916199, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0434, + "step": 11250 + }, + { + "epoch": 0.3219442458899214, + "grad_norm": 0.4074312150478363, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0428, + "step": 11260 + }, + { + "epoch": 0.3222301644031451, + "grad_norm": 1.0757715702056885, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0468, + "step": 11270 + }, + { + "epoch": 0.32251608291636885, + "grad_norm": 0.7281575202941895, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0386, + "step": 11280 + }, + { + "epoch": 0.3228020014295926, + "grad_norm": 0.35078516602516174, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0413, + "step": 11290 + }, + { + "epoch": 0.32308791994281627, + "grad_norm": 0.5642452836036682, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0435, + "step": 11300 + }, + { + "epoch": 0.32337383845604, + "grad_norm": 0.5326974987983704, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0459, + "step": 11310 + }, + { + "epoch": 0.32365975696926375, + "grad_norm": 0.6212049126625061, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0451, + "step": 11320 + }, + { + "epoch": 0.3239456754824875, + "grad_norm": 0.4887222349643707, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0445, + "step": 11330 + }, + { + "epoch": 0.3242315939957112, + "grad_norm": 0.6692403554916382, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0423, + "step": 11340 + }, + { + "epoch": 0.32451751250893496, + "grad_norm": 0.7166061997413635, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0445, + "step": 11350 + }, + { + "epoch": 0.3248034310221587, + "grad_norm": 0.5342463850975037, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0394, + "step": 11360 + }, + { + "epoch": 0.32508934953538243, + "grad_norm": 1.0617904663085938, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0401, + "step": 11370 + }, + { + "epoch": 0.32537526804860617, + "grad_norm": 0.9869458675384521, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0508, + "step": 11380 + }, + { + "epoch": 0.32566118656182985, + "grad_norm": 0.32021698355674744, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0346, + "step": 11390 + }, + { + "epoch": 0.3259471050750536, + "grad_norm": 0.6566154360771179, + "learning_rate": 1.486814531655139e-05, + "loss": 0.046, + "step": 11400 + }, + { + "epoch": 0.3262330235882773, + "grad_norm": 0.6716777086257935, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.045, + "step": 11410 + }, + { + "epoch": 0.32651894210150106, + "grad_norm": 0.7489042282104492, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0443, + "step": 11420 + }, + { + "epoch": 0.3268048606147248, + "grad_norm": 0.6040313243865967, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0418, + "step": 11430 + }, + { + "epoch": 0.32709077912794854, + "grad_norm": 0.4891999363899231, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0342, + "step": 11440 + }, + { + "epoch": 0.3273766976411723, + "grad_norm": 0.4264339506626129, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0414, + "step": 11450 + }, + { + "epoch": 0.327662616154396, + "grad_norm": 0.5535606741905212, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0362, + "step": 11460 + }, + { + "epoch": 0.32794853466761975, + "grad_norm": 0.566705048084259, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0472, + "step": 11470 + }, + { + "epoch": 0.32823445318084343, + "grad_norm": 0.8539089560508728, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0478, + "step": 11480 + }, + { + "epoch": 0.32852037169406717, + "grad_norm": 0.3981179893016815, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0429, + "step": 11490 + }, + { + "epoch": 0.3288062902072909, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0487, + "step": 11500 + }, + { + "epoch": 0.32909220872051465, + "grad_norm": 0.45551198720932007, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0384, + "step": 11510 + }, + { + "epoch": 0.3293781272337384, + "grad_norm": 0.6321517825126648, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0541, + "step": 11520 + }, + { + "epoch": 0.3296640457469621, + "grad_norm": 0.7971932888031006, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0445, + "step": 11530 + }, + { + "epoch": 0.32994996426018586, + "grad_norm": 0.5022657513618469, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0414, + "step": 11540 + }, + { + "epoch": 0.3302358827734096, + "grad_norm": 0.7302954196929932, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.044, + "step": 11550 + }, + { + "epoch": 0.33052180128663333, + "grad_norm": 0.5123834013938904, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0451, + "step": 11560 + }, + { + "epoch": 0.330807719799857, + "grad_norm": 0.5261625647544861, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0416, + "step": 11570 + }, + { + "epoch": 0.33109363831308075, + "grad_norm": 0.5782840251922607, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0419, + "step": 11580 + }, + { + "epoch": 0.3313795568263045, + "grad_norm": 0.9754800796508789, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0403, + "step": 11590 + }, + { + "epoch": 0.3316654753395282, + "grad_norm": 0.48157551884651184, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0459, + "step": 11600 + }, + { + "epoch": 0.33195139385275196, + "grad_norm": 0.4394964277744293, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0461, + "step": 11610 + }, + { + "epoch": 0.3322373123659757, + "grad_norm": 1.220790147781372, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0448, + "step": 11620 + }, + { + "epoch": 0.33252323087919944, + "grad_norm": 0.6908231973648071, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0431, + "step": 11630 + }, + { + "epoch": 0.3328091493924232, + "grad_norm": 0.45382779836654663, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0379, + "step": 11640 + }, + { + "epoch": 0.3330950679056469, + "grad_norm": 0.5963619947433472, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0465, + "step": 11650 + }, + { + "epoch": 0.3333809864188706, + "grad_norm": 0.676210880279541, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0411, + "step": 11660 + }, + { + "epoch": 0.33366690493209433, + "grad_norm": 0.893473744392395, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0443, + "step": 11670 + }, + { + "epoch": 0.33395282344531807, + "grad_norm": 0.30655553936958313, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.04, + "step": 11680 + }, + { + "epoch": 0.3342387419585418, + "grad_norm": 0.899615466594696, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0462, + "step": 11690 + }, + { + "epoch": 0.33452466047176554, + "grad_norm": 0.5037568807601929, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0394, + "step": 11700 + }, + { + "epoch": 0.3348105789849893, + "grad_norm": 0.573716402053833, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0426, + "step": 11710 + }, + { + "epoch": 0.335096497498213, + "grad_norm": 0.4985221326351166, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0422, + "step": 11720 + }, + { + "epoch": 0.33538241601143676, + "grad_norm": 0.8864797353744507, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0504, + "step": 11730 + }, + { + "epoch": 0.3356683345246605, + "grad_norm": 0.49209004640579224, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0409, + "step": 11740 + }, + { + "epoch": 0.3359542530378842, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0468, + "step": 11750 + }, + { + "epoch": 0.3362401715511079, + "grad_norm": 0.7552497386932373, + "learning_rate": 1.454836451908656e-05, + "loss": 0.041, + "step": 11760 + }, + { + "epoch": 0.33652609006433165, + "grad_norm": 0.5737242102622986, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0503, + "step": 11770 + }, + { + "epoch": 0.3368120085775554, + "grad_norm": 0.46150341629981995, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0399, + "step": 11780 + }, + { + "epoch": 0.3370979270907791, + "grad_norm": 0.55389803647995, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0442, + "step": 11790 + }, + { + "epoch": 0.33738384560400286, + "grad_norm": 0.7647727727890015, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0472, + "step": 11800 + }, + { + "epoch": 0.3376697641172266, + "grad_norm": 0.8755397200584412, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0444, + "step": 11810 + }, + { + "epoch": 0.33795568263045034, + "grad_norm": 0.9257917404174805, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0416, + "step": 11820 + }, + { + "epoch": 0.3382416011436741, + "grad_norm": 0.4048840403556824, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0418, + "step": 11830 + }, + { + "epoch": 0.33852751965689776, + "grad_norm": 0.584200382232666, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0436, + "step": 11840 + }, + { + "epoch": 0.3388134381701215, + "grad_norm": 0.7565616369247437, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0407, + "step": 11850 + }, + { + "epoch": 0.33909935668334523, + "grad_norm": 0.8025793433189392, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0424, + "step": 11860 + }, + { + "epoch": 0.33938527519656897, + "grad_norm": 0.3123756945133209, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.044, + "step": 11870 + }, + { + "epoch": 0.3396711937097927, + "grad_norm": 0.8047941327095032, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0471, + "step": 11880 + }, + { + "epoch": 0.33995711222301644, + "grad_norm": 0.8675779104232788, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0443, + "step": 11890 + }, + { + "epoch": 0.3402430307362402, + "grad_norm": 0.47229406237602234, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0416, + "step": 11900 + }, + { + "epoch": 0.3405289492494639, + "grad_norm": 0.3775595426559448, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0512, + "step": 11910 + }, + { + "epoch": 0.34081486776268766, + "grad_norm": 0.6179372668266296, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0395, + "step": 11920 + }, + { + "epoch": 0.34110078627591134, + "grad_norm": 0.47618359327316284, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0407, + "step": 11930 + }, + { + "epoch": 0.3413867047891351, + "grad_norm": 0.5495609641075134, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.041, + "step": 11940 + }, + { + "epoch": 0.3416726233023588, + "grad_norm": 0.7276089191436768, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0445, + "step": 11950 + }, + { + "epoch": 0.34195854181558255, + "grad_norm": 0.9464111328125, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0471, + "step": 11960 + }, + { + "epoch": 0.3422444603288063, + "grad_norm": 0.8340250253677368, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0488, + "step": 11970 + }, + { + "epoch": 0.34253037884203, + "grad_norm": 0.6392719149589539, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0407, + "step": 11980 + }, + { + "epoch": 0.34281629735525376, + "grad_norm": 0.7563493251800537, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0388, + "step": 11990 + }, + { + "epoch": 0.3431022158684775, + "grad_norm": 0.7145271301269531, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.042, + "step": 12000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.49566685085696e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/training_args.bin b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a8e9db2fc8c02e02c3d9dc8ab6720ad303a5b3a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612ba70c7690571cb25b3741b149289d0da6675f330268700d4dd75e92ecc19a +size 6097 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/added_tokens.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/generation_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00001-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0c5e00469177aab28431635ef7e3405ad7ec2adf --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4802561027a6bd13186b649a6ae195cbed2a93f50d6727fd315de58649afe95f +size 4921072616 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00002-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3ef487deddfd98345023af7f97907e385f6d6219 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63cd52bdb749083253500643953816ea44e611ce02be18104e97e87e2e1e560c +size 4978830984 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00003-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..624fb6f231e4a75b3d3c170c3343f189fb06953b --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c1537ea93754bcbedf067c82267bba45efbc4cea3176f93bf6b3ee48356edb3 +size 4100977896 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model.safetensors.index.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/norm_stats.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..7a37358d95e92a337ffbc69008e6d3a514583ff2 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -15.553912042236327, + -29.199742523193358, + -19.58108451538086, + -2.290254103851318, + -3.98537020587921, + -3.326780859374999, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 20.256868560791013, + 29.94644501495361, + 21.81786548461914, + 2.931905368041992, + 5.064435471534729, + 3.8213318216323877, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.8829866647720337, + 2.0021812915802, + 0.2094610631465912, + 0.0940750315785408, + 0.0910087525844574, + 0.012966467998921871, + -0.09716881066560745, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.976093769073486, + 10.930583953857422, + 8.330232620239258, + 0.8605863451957703, + 1.5304595232009888, + 1.1747541427612305, + 0.995267927646637, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -14.624815139007566, + -31.510755078125, + -35.281760287475585, + -4.413841687011719, + -8.509904860687255, + -6.548201916885375, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 40.4127169593811, + 31.91034956970215, + 26.84413584289551, + 7.540738459014893, + 10.178268561553956, + 9.913993389892582, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 10.31286334991455, + 3.0421667098999023, + -4.947638511657715, + 0.41632387042045593, + -0.9987452030181885, + -0.18793217837810516, + -0.08814626932144165, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 10.463665962219238, + 14.231209754943848, + 11.03242301940918, + 2.1795010566711426, + 3.3540749549865723, + 2.708117961883545, + 0.9961075186729431, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/pi0.yaml b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/special_tokens_map.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer.model b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/trainer_state.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b4be751149515ab821e89988f784cbb2eddacebb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/trainer_state.json @@ -0,0 +1,9834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4002859185132237, + "eval_steps": 500, + "global_step": 14000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002859185132237312, + "grad_norm": 4.32843542098999, + "learning_rate": 1.8e-07, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0005718370264474624, + "grad_norm": 5.184113502502441, + "learning_rate": 3.8e-07, + "loss": 0.6206, + "step": 20 + }, + { + "epoch": 0.0008577555396711937, + "grad_norm": 4.515527248382568, + "learning_rate": 5.800000000000001e-07, + "loss": 0.582, + "step": 30 + }, + { + "epoch": 0.0011436740528949249, + "grad_norm": 2.8382818698883057, + "learning_rate": 7.8e-07, + "loss": 0.544, + "step": 40 + }, + { + "epoch": 0.0014295925661186562, + "grad_norm": 4.019079208374023, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6381, + "step": 50 + }, + { + "epoch": 0.0017155110793423873, + "grad_norm": 2.9916157722473145, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5463, + "step": 60 + }, + { + "epoch": 0.0020014295925661185, + "grad_norm": 3.3288328647613525, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.446, + "step": 70 + }, + { + "epoch": 0.0022873481057898498, + "grad_norm": 3.181410312652588, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4497, + "step": 80 + }, + { + "epoch": 0.002573266619013581, + "grad_norm": 1.421942949295044, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.349, + "step": 90 + }, + { + "epoch": 0.0028591851322373124, + "grad_norm": 1.908596396446228, + "learning_rate": 1.98e-06, + "loss": 0.3338, + "step": 100 + }, + { + "epoch": 0.0031451036454610438, + "grad_norm": 1.8309729099273682, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.0034310221586847747, + "grad_norm": 3.051408290863037, + "learning_rate": 2.38e-06, + "loss": 0.2418, + "step": 120 + }, + { + "epoch": 0.003716940671908506, + "grad_norm": 2.4083356857299805, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1726, + "step": 130 + }, + { + "epoch": 0.004002859185132237, + "grad_norm": 1.111687421798706, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.2164, + "step": 140 + }, + { + "epoch": 0.004288777698355968, + "grad_norm": 1.3874679803848267, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1312, + "step": 150 + }, + { + "epoch": 0.0045746962115796996, + "grad_norm": 1.2791540622711182, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1198, + "step": 160 + }, + { + "epoch": 0.004860614724803431, + "grad_norm": 1.6237181425094604, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1027, + "step": 170 + }, + { + "epoch": 0.005146533238027162, + "grad_norm": 0.9669432640075684, + "learning_rate": 3.58e-06, + "loss": 0.0968, + "step": 180 + }, + { + "epoch": 0.0054324517512508936, + "grad_norm": 1.4933182001113892, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.1012, + "step": 190 + }, + { + "epoch": 0.005718370264474625, + "grad_norm": 1.8615745306015015, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0901, + "step": 200 + }, + { + "epoch": 0.006004288777698356, + "grad_norm": 1.867163062095642, + "learning_rate": 4.18e-06, + "loss": 0.1067, + "step": 210 + }, + { + "epoch": 0.0062902072909220876, + "grad_norm": 1.199497103691101, + "learning_rate": 4.38e-06, + "loss": 0.0841, + "step": 220 + }, + { + "epoch": 0.006576125804145818, + "grad_norm": 1.1568272113800049, + "learning_rate": 4.58e-06, + "loss": 0.0951, + "step": 230 + }, + { + "epoch": 0.006862044317369549, + "grad_norm": 2.139226198196411, + "learning_rate": 4.78e-06, + "loss": 0.0845, + "step": 240 + }, + { + "epoch": 0.007147962830593281, + "grad_norm": 1.0357667207717896, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0828, + "step": 250 + }, + { + "epoch": 0.007433881343817012, + "grad_norm": 1.0145683288574219, + "learning_rate": 5.18e-06, + "loss": 0.0925, + "step": 260 + }, + { + "epoch": 0.007719799857040743, + "grad_norm": 1.308053731918335, + "learning_rate": 5.380000000000001e-06, + "loss": 0.082, + "step": 270 + }, + { + "epoch": 0.008005718370264474, + "grad_norm": 1.1561739444732666, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0888, + "step": 280 + }, + { + "epoch": 0.008291636883488206, + "grad_norm": 0.8777005672454834, + "learning_rate": 5.78e-06, + "loss": 0.0693, + "step": 290 + }, + { + "epoch": 0.008577555396711936, + "grad_norm": 0.9127368330955505, + "learning_rate": 5.98e-06, + "loss": 0.0823, + "step": 300 + }, + { + "epoch": 0.008863473909935669, + "grad_norm": 0.5608117580413818, + "learning_rate": 6.18e-06, + "loss": 0.0733, + "step": 310 + }, + { + "epoch": 0.009149392423159399, + "grad_norm": 1.9068444967269897, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0772, + "step": 320 + }, + { + "epoch": 0.009435310936383131, + "grad_norm": 0.9090886116027832, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.062, + "step": 330 + }, + { + "epoch": 0.009721229449606862, + "grad_norm": 1.191778540611267, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0718, + "step": 340 + }, + { + "epoch": 0.010007147962830594, + "grad_norm": 1.3743036985397339, + "learning_rate": 6.98e-06, + "loss": 0.0822, + "step": 350 + }, + { + "epoch": 0.010293066476054324, + "grad_norm": 1.4244364500045776, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0793, + "step": 360 + }, + { + "epoch": 0.010578984989278055, + "grad_norm": 1.1766910552978516, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0637, + "step": 370 + }, + { + "epoch": 0.010864903502501787, + "grad_norm": 1.1331329345703125, + "learning_rate": 7.58e-06, + "loss": 0.0705, + "step": 380 + }, + { + "epoch": 0.011150822015725518, + "grad_norm": 0.4898548424243927, + "learning_rate": 7.78e-06, + "loss": 0.0686, + "step": 390 + }, + { + "epoch": 0.01143674052894925, + "grad_norm": 0.7398406267166138, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0719, + "step": 400 + }, + { + "epoch": 0.01172265904217298, + "grad_norm": 1.1516162157058716, + "learning_rate": 8.18e-06, + "loss": 0.0696, + "step": 410 + }, + { + "epoch": 0.012008577555396712, + "grad_norm": 1.6034163236618042, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0698, + "step": 420 + }, + { + "epoch": 0.012294496068620443, + "grad_norm": 1.2195311784744263, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.012580414581844175, + "grad_norm": 1.1106441020965576, + "learning_rate": 8.78e-06, + "loss": 0.0749, + "step": 440 + }, + { + "epoch": 0.012866333095067906, + "grad_norm": 1.1787506341934204, + "learning_rate": 8.98e-06, + "loss": 0.0718, + "step": 450 + }, + { + "epoch": 0.013152251608291636, + "grad_norm": 0.4380492568016052, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0692, + "step": 460 + }, + { + "epoch": 0.013438170121515368, + "grad_norm": 1.0138392448425293, + "learning_rate": 9.38e-06, + "loss": 0.0718, + "step": 470 + }, + { + "epoch": 0.013724088634739099, + "grad_norm": 0.50003582239151, + "learning_rate": 9.58e-06, + "loss": 0.078, + "step": 480 + }, + { + "epoch": 0.014010007147962831, + "grad_norm": 0.6253323554992676, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0762, + "step": 490 + }, + { + "epoch": 0.014295925661186561, + "grad_norm": 0.6725791096687317, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0615, + "step": 500 + }, + { + "epoch": 0.014581844174410294, + "grad_norm": 0.6100206971168518, + "learning_rate": 1.018e-05, + "loss": 0.0576, + "step": 510 + }, + { + "epoch": 0.014867762687634024, + "grad_norm": 1.9225071668624878, + "learning_rate": 1.038e-05, + "loss": 0.0957, + "step": 520 + }, + { + "epoch": 0.015153681200857756, + "grad_norm": 1.304625391960144, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.015439599714081487, + "grad_norm": 0.7657200694084167, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0612, + "step": 540 + }, + { + "epoch": 0.015725518227305217, + "grad_norm": 0.7371220588684082, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0719, + "step": 550 + }, + { + "epoch": 0.016011436740528948, + "grad_norm": 0.7274985313415527, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0594, + "step": 560 + }, + { + "epoch": 0.01629735525375268, + "grad_norm": 1.3222947120666504, + "learning_rate": 1.138e-05, + "loss": 0.0655, + "step": 570 + }, + { + "epoch": 0.016583273766976412, + "grad_norm": 0.965411901473999, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0638, + "step": 580 + }, + { + "epoch": 0.016869192280200142, + "grad_norm": 0.8161532878875732, + "learning_rate": 1.178e-05, + "loss": 0.0532, + "step": 590 + }, + { + "epoch": 0.017155110793423873, + "grad_norm": 0.8228808045387268, + "learning_rate": 1.198e-05, + "loss": 0.051, + "step": 600 + }, + { + "epoch": 0.017441029306647607, + "grad_norm": 0.6932743191719055, + "learning_rate": 1.218e-05, + "loss": 0.0595, + "step": 610 + }, + { + "epoch": 0.017726947819871337, + "grad_norm": 0.6848511099815369, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0589, + "step": 620 + }, + { + "epoch": 0.018012866333095068, + "grad_norm": 1.137454867362976, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0694, + "step": 630 + }, + { + "epoch": 0.018298784846318798, + "grad_norm": 0.8087878227233887, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0738, + "step": 640 + }, + { + "epoch": 0.01858470335954253, + "grad_norm": 0.8093737363815308, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.058, + "step": 650 + }, + { + "epoch": 0.018870621872766263, + "grad_norm": 0.8387401700019836, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0686, + "step": 660 + }, + { + "epoch": 0.019156540385989993, + "grad_norm": 1.1544110774993896, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0592, + "step": 670 + }, + { + "epoch": 0.019442458899213724, + "grad_norm": 0.8208314180374146, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.019728377412437454, + "grad_norm": 0.97088623046875, + "learning_rate": 1.378e-05, + "loss": 0.0675, + "step": 690 + }, + { + "epoch": 0.020014295925661188, + "grad_norm": 1.0991814136505127, + "learning_rate": 1.398e-05, + "loss": 0.0745, + "step": 700 + }, + { + "epoch": 0.02030021443888492, + "grad_norm": 0.9467299580574036, + "learning_rate": 1.418e-05, + "loss": 0.0645, + "step": 710 + }, + { + "epoch": 0.02058613295210865, + "grad_norm": 0.4910801351070404, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0466, + "step": 720 + }, + { + "epoch": 0.02087205146533238, + "grad_norm": 1.0102845430374146, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0735, + "step": 730 + }, + { + "epoch": 0.02115796997855611, + "grad_norm": 0.9033467769622803, + "learning_rate": 1.478e-05, + "loss": 0.0741, + "step": 740 + }, + { + "epoch": 0.021443888491779844, + "grad_norm": 1.6092171669006348, + "learning_rate": 1.498e-05, + "loss": 0.0737, + "step": 750 + }, + { + "epoch": 0.021729807005003574, + "grad_norm": 0.7047333717346191, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0604, + "step": 760 + }, + { + "epoch": 0.022015725518227305, + "grad_norm": 1.2015491724014282, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0646, + "step": 770 + }, + { + "epoch": 0.022301644031451035, + "grad_norm": 1.1669623851776123, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0587, + "step": 780 + }, + { + "epoch": 0.02258756254467477, + "grad_norm": 1.137113094329834, + "learning_rate": 1.578e-05, + "loss": 0.0692, + "step": 790 + }, + { + "epoch": 0.0228734810578985, + "grad_norm": 1.269505262374878, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0711, + "step": 800 + }, + { + "epoch": 0.02315939957112223, + "grad_norm": 0.942534863948822, + "learning_rate": 1.618e-05, + "loss": 0.0782, + "step": 810 + }, + { + "epoch": 0.02344531808434596, + "grad_norm": 0.9548556208610535, + "learning_rate": 1.638e-05, + "loss": 0.0814, + "step": 820 + }, + { + "epoch": 0.02373123659756969, + "grad_norm": 1.0210421085357666, + "learning_rate": 1.658e-05, + "loss": 0.0774, + "step": 830 + }, + { + "epoch": 0.024017155110793425, + "grad_norm": 1.0955135822296143, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0693, + "step": 840 + }, + { + "epoch": 0.024303073624017155, + "grad_norm": 1.2081682682037354, + "learning_rate": 1.698e-05, + "loss": 0.0589, + "step": 850 + }, + { + "epoch": 0.024588992137240886, + "grad_norm": 0.9728164076805115, + "learning_rate": 1.718e-05, + "loss": 0.0585, + "step": 860 + }, + { + "epoch": 0.024874910650464616, + "grad_norm": 1.310244083404541, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.066, + "step": 870 + }, + { + "epoch": 0.02516082916368835, + "grad_norm": 0.8860681653022766, + "learning_rate": 1.758e-05, + "loss": 0.0703, + "step": 880 + }, + { + "epoch": 0.02544674767691208, + "grad_norm": 2.1878466606140137, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0913, + "step": 890 + }, + { + "epoch": 0.02573266619013581, + "grad_norm": 0.6659205555915833, + "learning_rate": 1.798e-05, + "loss": 0.0603, + "step": 900 + }, + { + "epoch": 0.02601858470335954, + "grad_norm": 0.6700656414031982, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.074, + "step": 910 + }, + { + "epoch": 0.026304503216583272, + "grad_norm": 0.8292778134346008, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0529, + "step": 920 + }, + { + "epoch": 0.026590421729807006, + "grad_norm": 0.9637550115585327, + "learning_rate": 1.858e-05, + "loss": 0.0604, + "step": 930 + }, + { + "epoch": 0.026876340243030736, + "grad_norm": 0.4605652689933777, + "learning_rate": 1.878e-05, + "loss": 0.0657, + "step": 940 + }, + { + "epoch": 0.027162258756254467, + "grad_norm": 1.3346972465515137, + "learning_rate": 1.898e-05, + "loss": 0.0576, + "step": 950 + }, + { + "epoch": 0.027448177269478197, + "grad_norm": 0.8369432091712952, + "learning_rate": 1.918e-05, + "loss": 0.0567, + "step": 960 + }, + { + "epoch": 0.02773409578270193, + "grad_norm": 0.613459050655365, + "learning_rate": 1.938e-05, + "loss": 0.0523, + "step": 970 + }, + { + "epoch": 0.028020014295925662, + "grad_norm": 1.402799367904663, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0794, + "step": 980 + }, + { + "epoch": 0.028305932809149392, + "grad_norm": 1.1603201627731323, + "learning_rate": 1.978e-05, + "loss": 0.0583, + "step": 990 + }, + { + "epoch": 0.028591851322373123, + "grad_norm": 0.8101517558097839, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0584, + "step": 1000 + }, + { + "epoch": 0.028877769835596853, + "grad_norm": 1.060592770576477, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.077, + "step": 1010 + }, + { + "epoch": 0.029163688348820587, + "grad_norm": 1.2096195220947266, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.066, + "step": 1020 + }, + { + "epoch": 0.029449606862044318, + "grad_norm": 1.0035862922668457, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0625, + "step": 1030 + }, + { + "epoch": 0.029735525375268048, + "grad_norm": 0.44185084104537964, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.03002144388849178, + "grad_norm": 1.209908127784729, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0693, + "step": 1050 + }, + { + "epoch": 0.030307362401715512, + "grad_norm": 0.9716938138008118, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0706, + "step": 1060 + }, + { + "epoch": 0.030593280914939243, + "grad_norm": 0.8310994505882263, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0622, + "step": 1070 + }, + { + "epoch": 0.030879199428162973, + "grad_norm": 0.8737888932228088, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0564, + "step": 1080 + }, + { + "epoch": 0.031165117941386704, + "grad_norm": 0.7609763145446777, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0552, + "step": 1090 + }, + { + "epoch": 0.031451036454610434, + "grad_norm": 0.6319764256477356, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0593, + "step": 1100 + }, + { + "epoch": 0.031736954967834165, + "grad_norm": 0.5562251806259155, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0553, + "step": 1110 + }, + { + "epoch": 0.032022873481057895, + "grad_norm": 1.3476046323776245, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0805, + "step": 1120 + }, + { + "epoch": 0.03230879199428163, + "grad_norm": 0.5449394583702087, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0666, + "step": 1130 + }, + { + "epoch": 0.03259471050750536, + "grad_norm": 0.8675817251205444, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0703, + "step": 1140 + }, + { + "epoch": 0.032880629020729094, + "grad_norm": 0.8713150024414062, + "learning_rate": 1.999882759038658e-05, + "loss": 0.063, + "step": 1150 + }, + { + "epoch": 0.033166547533952824, + "grad_norm": 0.7205761075019836, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0597, + "step": 1160 + }, + { + "epoch": 0.033452466047176554, + "grad_norm": 0.482741117477417, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0558, + "step": 1170 + }, + { + "epoch": 0.033738384560400285, + "grad_norm": 0.8652167320251465, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0559, + "step": 1180 + }, + { + "epoch": 0.034024303073624015, + "grad_norm": 0.5286755561828613, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0673, + "step": 1190 + }, + { + "epoch": 0.034310221586847746, + "grad_norm": 0.9883217215538025, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0609, + "step": 1200 + }, + { + "epoch": 0.034596140100071476, + "grad_norm": 0.7700253129005432, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0703, + "step": 1210 + }, + { + "epoch": 0.034882058613295214, + "grad_norm": 0.8669867515563965, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0577, + "step": 1220 + }, + { + "epoch": 0.035167977126518944, + "grad_norm": 0.8856104016304016, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0599, + "step": 1230 + }, + { + "epoch": 0.035453895639742675, + "grad_norm": 0.5517004728317261, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0791, + "step": 1240 + }, + { + "epoch": 0.035739814152966405, + "grad_norm": 0.7505853176116943, + "learning_rate": 1.999672592499692e-05, + "loss": 0.086, + "step": 1250 + }, + { + "epoch": 0.036025732666190136, + "grad_norm": 0.7412230968475342, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0534, + "step": 1260 + }, + { + "epoch": 0.036311651179413866, + "grad_norm": 0.6629419922828674, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0607, + "step": 1270 + }, + { + "epoch": 0.036597569692637597, + "grad_norm": 0.7081887125968933, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0513, + "step": 1280 + }, + { + "epoch": 0.03688348820586133, + "grad_norm": 0.8555129766464233, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0574, + "step": 1290 + }, + { + "epoch": 0.03716940671908506, + "grad_norm": 0.5992563366889954, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0529, + "step": 1300 + }, + { + "epoch": 0.037455325232308795, + "grad_norm": 0.8527185320854187, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0588, + "step": 1310 + }, + { + "epoch": 0.037741243745532525, + "grad_norm": 1.078600525856018, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0644, + "step": 1320 + }, + { + "epoch": 0.038027162258756256, + "grad_norm": 0.8158502578735352, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0587, + "step": 1330 + }, + { + "epoch": 0.038313080771979986, + "grad_norm": 1.011278748512268, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0696, + "step": 1340 + }, + { + "epoch": 0.03859899928520372, + "grad_norm": 0.806888222694397, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0588, + "step": 1350 + }, + { + "epoch": 0.03888491779842745, + "grad_norm": 0.7776031494140625, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0461, + "step": 1360 + }, + { + "epoch": 0.03917083631165118, + "grad_norm": 0.6119349598884583, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0566, + "step": 1370 + }, + { + "epoch": 0.03945675482487491, + "grad_norm": 0.6168059706687927, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0636, + "step": 1380 + }, + { + "epoch": 0.03974267333809864, + "grad_norm": 0.8180692195892334, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0596, + "step": 1390 + }, + { + "epoch": 0.040028591851322376, + "grad_norm": 0.6775726079940796, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0586, + "step": 1400 + }, + { + "epoch": 0.040314510364546106, + "grad_norm": 0.7446377873420715, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.057, + "step": 1410 + }, + { + "epoch": 0.04060042887776984, + "grad_norm": 0.9334514737129211, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0551, + "step": 1420 + }, + { + "epoch": 0.04088634739099357, + "grad_norm": 1.481874942779541, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0664, + "step": 1430 + }, + { + "epoch": 0.0411722659042173, + "grad_norm": 0.9553850889205933, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0645, + "step": 1440 + }, + { + "epoch": 0.04145818441744103, + "grad_norm": 0.8824119567871094, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0694, + "step": 1450 + }, + { + "epoch": 0.04174410293066476, + "grad_norm": 1.0382661819458008, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0683, + "step": 1460 + }, + { + "epoch": 0.04203002144388849, + "grad_norm": 0.5914127826690674, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0652, + "step": 1470 + }, + { + "epoch": 0.04231593995711222, + "grad_norm": 0.8497964143753052, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0706, + "step": 1480 + }, + { + "epoch": 0.04260185847033596, + "grad_norm": 0.897759199142456, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0647, + "step": 1490 + }, + { + "epoch": 0.04288777698355969, + "grad_norm": 1.1102443933486938, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0579, + "step": 1500 + }, + { + "epoch": 0.04317369549678342, + "grad_norm": 0.7638678550720215, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0606, + "step": 1510 + }, + { + "epoch": 0.04345961401000715, + "grad_norm": 0.6662708520889282, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.067, + "step": 1520 + }, + { + "epoch": 0.04374553252323088, + "grad_norm": 0.4957924485206604, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0685, + "step": 1530 + }, + { + "epoch": 0.04403145103645461, + "grad_norm": 0.6456794738769531, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0665, + "step": 1540 + }, + { + "epoch": 0.04431736954967834, + "grad_norm": 1.1598498821258545, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0527, + "step": 1550 + }, + { + "epoch": 0.04460328806290207, + "grad_norm": 0.931520938873291, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0682, + "step": 1560 + }, + { + "epoch": 0.0448892065761258, + "grad_norm": 0.7289925813674927, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0726, + "step": 1570 + }, + { + "epoch": 0.04517512508934954, + "grad_norm": 0.5471235513687134, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0561, + "step": 1580 + }, + { + "epoch": 0.04546104360257327, + "grad_norm": 0.8686550259590149, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0552, + "step": 1590 + }, + { + "epoch": 0.045746962115797, + "grad_norm": 1.1767120361328125, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0544, + "step": 1600 + }, + { + "epoch": 0.04603288062902073, + "grad_norm": 0.8729729056358337, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0625, + "step": 1610 + }, + { + "epoch": 0.04631879914224446, + "grad_norm": 1.3734601736068726, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0667, + "step": 1620 + }, + { + "epoch": 0.04660471765546819, + "grad_norm": 0.6810682415962219, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0522, + "step": 1630 + }, + { + "epoch": 0.04689063616869192, + "grad_norm": 0.7744873762130737, + "learning_rate": 1.997844517262844e-05, + "loss": 0.06, + "step": 1640 + }, + { + "epoch": 0.04717655468191565, + "grad_norm": 1.000954270362854, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0606, + "step": 1650 + }, + { + "epoch": 0.04746247319513938, + "grad_norm": 0.8105701208114624, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.04774839170836312, + "grad_norm": 0.9504240155220032, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0833, + "step": 1670 + }, + { + "epoch": 0.04803431022158685, + "grad_norm": 0.910836935043335, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0582, + "step": 1680 + }, + { + "epoch": 0.04832022873481058, + "grad_norm": 0.5865645408630371, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0657, + "step": 1690 + }, + { + "epoch": 0.04860614724803431, + "grad_norm": 1.0098698139190674, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 0.04889206576125804, + "grad_norm": 0.8097764253616333, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0563, + "step": 1710 + }, + { + "epoch": 0.04917798427448177, + "grad_norm": 0.9958128333091736, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0597, + "step": 1720 + }, + { + "epoch": 0.0494639027877055, + "grad_norm": 0.8471905589103699, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.04974982130092923, + "grad_norm": 0.647058367729187, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 0.05003573981415296, + "grad_norm": 1.0832161903381348, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.052, + "step": 1750 + }, + { + "epoch": 0.0503216583273767, + "grad_norm": 0.8469381332397461, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0615, + "step": 1760 + }, + { + "epoch": 0.05060757684060043, + "grad_norm": 0.5371052622795105, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0556, + "step": 1770 + }, + { + "epoch": 0.05089349535382416, + "grad_norm": 0.9016183614730835, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0561, + "step": 1780 + }, + { + "epoch": 0.05117941386704789, + "grad_norm": 0.8829526305198669, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0648, + "step": 1790 + }, + { + "epoch": 0.05146533238027162, + "grad_norm": 1.079738974571228, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0577, + "step": 1800 + }, + { + "epoch": 0.05175125089349535, + "grad_norm": 0.7496556639671326, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.052, + "step": 1810 + }, + { + "epoch": 0.05203716940671908, + "grad_norm": 0.7587016820907593, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0634, + "step": 1820 + }, + { + "epoch": 0.052323087919942814, + "grad_norm": 0.9622246623039246, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0543, + "step": 1830 + }, + { + "epoch": 0.052609006433166544, + "grad_norm": 0.6643623113632202, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0606, + "step": 1840 + }, + { + "epoch": 0.05289492494639028, + "grad_norm": 0.8060843348503113, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0562, + "step": 1850 + }, + { + "epoch": 0.05318084345961401, + "grad_norm": 0.7353034019470215, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0679, + "step": 1860 + }, + { + "epoch": 0.05346676197283774, + "grad_norm": 0.6636782288551331, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0561, + "step": 1870 + }, + { + "epoch": 0.05375268048606147, + "grad_norm": 0.6760010719299316, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0674, + "step": 1880 + }, + { + "epoch": 0.0540385989992852, + "grad_norm": 0.7144591808319092, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0551, + "step": 1890 + }, + { + "epoch": 0.054324517512508934, + "grad_norm": 0.8346575498580933, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.049, + "step": 1900 + }, + { + "epoch": 0.054610436025732664, + "grad_norm": 1.1682871580123901, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0554, + "step": 1910 + }, + { + "epoch": 0.054896354538956395, + "grad_norm": 0.9150840640068054, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0549, + "step": 1920 + }, + { + "epoch": 0.055182273052180125, + "grad_norm": 0.37064746022224426, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0547, + "step": 1930 + }, + { + "epoch": 0.05546819156540386, + "grad_norm": 1.1214783191680908, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0697, + "step": 1940 + }, + { + "epoch": 0.05575411007862759, + "grad_norm": 0.8259853720664978, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0614, + "step": 1950 + }, + { + "epoch": 0.056040028591851324, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0811, + "step": 1960 + }, + { + "epoch": 0.056325947105075054, + "grad_norm": 0.8764797449111938, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0605, + "step": 1970 + }, + { + "epoch": 0.056611865618298784, + "grad_norm": 0.770044207572937, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0481, + "step": 1980 + }, + { + "epoch": 0.056897784131522515, + "grad_norm": 1.333876132965088, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0729, + "step": 1990 + }, + { + "epoch": 0.057183702644746245, + "grad_norm": 0.5231258273124695, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.051, + "step": 2000 + }, + { + "epoch": 0.057469621157969976, + "grad_norm": 1.1937541961669922, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.061, + "step": 2010 + }, + { + "epoch": 0.057755539671193706, + "grad_norm": 0.7843487858772278, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0688, + "step": 2020 + }, + { + "epoch": 0.058041458184417444, + "grad_norm": 0.7956593632698059, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0878, + "step": 2030 + }, + { + "epoch": 0.058327376697641174, + "grad_norm": 0.5006444454193115, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0588, + "step": 2040 + }, + { + "epoch": 0.058613295210864905, + "grad_norm": 1.162245750427246, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0619, + "step": 2050 + }, + { + "epoch": 0.058899213724088635, + "grad_norm": 0.46943384408950806, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0584, + "step": 2060 + }, + { + "epoch": 0.059185132237312366, + "grad_norm": 0.3780323266983032, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0462, + "step": 2070 + }, + { + "epoch": 0.059471050750536096, + "grad_norm": 0.7066171765327454, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0652, + "step": 2080 + }, + { + "epoch": 0.05975696926375983, + "grad_norm": 0.8464685082435608, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0546, + "step": 2090 + }, + { + "epoch": 0.06004288777698356, + "grad_norm": 0.7198944687843323, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0534, + "step": 2100 + }, + { + "epoch": 0.06032880629020729, + "grad_norm": 0.7136557698249817, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0665, + "step": 2110 + }, + { + "epoch": 0.060614724803431025, + "grad_norm": 0.8739225268363953, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0542, + "step": 2120 + }, + { + "epoch": 0.060900643316654755, + "grad_norm": 0.6694063544273376, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0575, + "step": 2130 + }, + { + "epoch": 0.061186561829878486, + "grad_norm": 0.4805296063423157, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0578, + "step": 2140 + }, + { + "epoch": 0.061472480343102216, + "grad_norm": 0.758660078048706, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0455, + "step": 2150 + }, + { + "epoch": 0.06175839885632595, + "grad_norm": 0.8114968538284302, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0801, + "step": 2160 + }, + { + "epoch": 0.06204431736954968, + "grad_norm": 0.6585670113563538, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0564, + "step": 2170 + }, + { + "epoch": 0.06233023588277341, + "grad_norm": 1.2986794710159302, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0595, + "step": 2180 + }, + { + "epoch": 0.06261615439599715, + "grad_norm": 0.9822471141815186, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0515, + "step": 2190 + }, + { + "epoch": 0.06290207290922087, + "grad_norm": 0.8112025260925293, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0585, + "step": 2200 + }, + { + "epoch": 0.0631879914224446, + "grad_norm": 0.6239551305770874, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0641, + "step": 2210 + }, + { + "epoch": 0.06347390993566833, + "grad_norm": 0.8405657410621643, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.057, + "step": 2220 + }, + { + "epoch": 0.06375982844889207, + "grad_norm": 0.4925670623779297, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0568, + "step": 2230 + }, + { + "epoch": 0.06404574696211579, + "grad_norm": 0.8599978089332581, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0587, + "step": 2240 + }, + { + "epoch": 0.06433166547533953, + "grad_norm": 0.8657258749008179, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.06461758398856327, + "grad_norm": 0.5826218128204346, + "learning_rate": 1.991642153373178e-05, + "loss": 0.055, + "step": 2260 + }, + { + "epoch": 0.06490350250178699, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0533, + "step": 2270 + }, + { + "epoch": 0.06518942101501073, + "grad_norm": 0.8345134258270264, + "learning_rate": 1.991374933341515e-05, + "loss": 0.064, + "step": 2280 + }, + { + "epoch": 0.06547533952823445, + "grad_norm": 0.6610177755355835, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0553, + "step": 2290 + }, + { + "epoch": 0.06576125804145819, + "grad_norm": 0.8541404604911804, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0483, + "step": 2300 + }, + { + "epoch": 0.06604717655468191, + "grad_norm": 0.9029123187065125, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0517, + "step": 2310 + }, + { + "epoch": 0.06633309506790565, + "grad_norm": 0.614111602306366, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.06661901358112937, + "grad_norm": 0.8723806142807007, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0511, + "step": 2330 + }, + { + "epoch": 0.06690493209435311, + "grad_norm": 0.5288586020469666, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0474, + "step": 2340 + }, + { + "epoch": 0.06719085060757685, + "grad_norm": 0.6346511840820312, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0532, + "step": 2350 + }, + { + "epoch": 0.06747676912080057, + "grad_norm": 0.9112687706947327, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0583, + "step": 2360 + }, + { + "epoch": 0.06776268763402431, + "grad_norm": 0.6879385113716125, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0551, + "step": 2370 + }, + { + "epoch": 0.06804860614724803, + "grad_norm": 0.6945562958717346, + "learning_rate": 1.989976094288735e-05, + "loss": 0.053, + "step": 2380 + }, + { + "epoch": 0.06833452466047177, + "grad_norm": 0.6774301528930664, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0596, + "step": 2390 + }, + { + "epoch": 0.06862044317369549, + "grad_norm": 0.7311446070671082, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0576, + "step": 2400 + }, + { + "epoch": 0.06890636168691923, + "grad_norm": 0.9301936030387878, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0608, + "step": 2410 + }, + { + "epoch": 0.06919228020014295, + "grad_norm": 1.1750341653823853, + "learning_rate": 1.989387305123247e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.06947819871336669, + "grad_norm": 0.716266930103302, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.053, + "step": 2430 + }, + { + "epoch": 0.06976411722659043, + "grad_norm": 0.8549973964691162, + "learning_rate": 1.989086647373215e-05, + "loss": 0.061, + "step": 2440 + }, + { + "epoch": 0.07005003573981415, + "grad_norm": 0.7306638360023499, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "epoch": 0.07033595425303789, + "grad_norm": 1.2529624700546265, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0597, + "step": 2460 + }, + { + "epoch": 0.07062187276626161, + "grad_norm": 0.7199717164039612, + "learning_rate": 1.988627835751598e-05, + "loss": 0.047, + "step": 2470 + }, + { + "epoch": 0.07090779127948535, + "grad_norm": 0.8007253408432007, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0544, + "step": 2480 + }, + { + "epoch": 0.07119370979270907, + "grad_norm": 0.7852535843849182, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0507, + "step": 2490 + }, + { + "epoch": 0.07147962830593281, + "grad_norm": 1.0649739503860474, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.058, + "step": 2500 + }, + { + "epoch": 0.07176554681915653, + "grad_norm": 0.8080071806907654, + "learning_rate": 1.988001487826387e-05, + "loss": 0.059, + "step": 2510 + }, + { + "epoch": 0.07205146533238027, + "grad_norm": 0.49453601241111755, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0522, + "step": 2520 + }, + { + "epoch": 0.07233738384560401, + "grad_norm": 0.7618975639343262, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.07262330235882773, + "grad_norm": 0.6284596920013428, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.058, + "step": 2540 + }, + { + "epoch": 0.07290922087205147, + "grad_norm": 1.6536812782287598, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0535, + "step": 2550 + }, + { + "epoch": 0.07319513938527519, + "grad_norm": 0.6516987681388855, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.061, + "step": 2560 + }, + { + "epoch": 0.07348105789849893, + "grad_norm": 0.7660441398620605, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0603, + "step": 2570 + }, + { + "epoch": 0.07376697641172265, + "grad_norm": 0.7900884747505188, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0494, + "step": 2580 + }, + { + "epoch": 0.07405289492494639, + "grad_norm": 0.9578459858894348, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0492, + "step": 2590 + }, + { + "epoch": 0.07433881343817011, + "grad_norm": 0.5268751978874207, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0524, + "step": 2600 + }, + { + "epoch": 0.07462473195139385, + "grad_norm": 0.8935990929603577, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0537, + "step": 2610 + }, + { + "epoch": 0.07491065046461759, + "grad_norm": 0.940441370010376, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0545, + "step": 2620 + }, + { + "epoch": 0.07519656897784131, + "grad_norm": 0.42767468094825745, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0508, + "step": 2630 + }, + { + "epoch": 0.07548248749106505, + "grad_norm": 0.6892207860946655, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0417, + "step": 2640 + }, + { + "epoch": 0.07576840600428877, + "grad_norm": 1.2622859477996826, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0665, + "step": 2650 + }, + { + "epoch": 0.07605432451751251, + "grad_norm": 0.8809115290641785, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0617, + "step": 2660 + }, + { + "epoch": 0.07634024303073624, + "grad_norm": 0.604371190071106, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0577, + "step": 2670 + }, + { + "epoch": 0.07662616154395997, + "grad_norm": 0.7091525793075562, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0552, + "step": 2680 + }, + { + "epoch": 0.0769120800571837, + "grad_norm": 0.7841326594352722, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0579, + "step": 2690 + }, + { + "epoch": 0.07719799857040743, + "grad_norm": 0.7789046764373779, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0511, + "step": 2700 + }, + { + "epoch": 0.07748391708363117, + "grad_norm": 0.6497660875320435, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0532, + "step": 2710 + }, + { + "epoch": 0.0777698355968549, + "grad_norm": 0.6902356147766113, + "learning_rate": 1.984439891859038e-05, + "loss": 0.06, + "step": 2720 + }, + { + "epoch": 0.07805575411007863, + "grad_norm": 0.5721703767776489, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0442, + "step": 2730 + }, + { + "epoch": 0.07834167262330236, + "grad_norm": 0.5205336809158325, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0551, + "step": 2740 + }, + { + "epoch": 0.07862759113652609, + "grad_norm": 1.0646073818206787, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0546, + "step": 2750 + }, + { + "epoch": 0.07891350964974982, + "grad_norm": 0.6809906363487244, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0603, + "step": 2760 + }, + { + "epoch": 0.07919942816297355, + "grad_norm": 0.7592756152153015, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0611, + "step": 2770 + }, + { + "epoch": 0.07948534667619728, + "grad_norm": 0.970733106136322, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.066, + "step": 2780 + }, + { + "epoch": 0.07977126518942101, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.983150881656814e-05, + "loss": 0.049, + "step": 2790 + }, + { + "epoch": 0.08005718370264475, + "grad_norm": 0.6761397123336792, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.048, + "step": 2800 + }, + { + "epoch": 0.08034310221586848, + "grad_norm": 0.9752228856086731, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.08062902072909221, + "grad_norm": 0.8727124929428101, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0629, + "step": 2820 + }, + { + "epoch": 0.08091493924231594, + "grad_norm": 0.8425240516662598, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0517, + "step": 2830 + }, + { + "epoch": 0.08120085775553967, + "grad_norm": 0.7011470198631287, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0484, + "step": 2840 + }, + { + "epoch": 0.0814867762687634, + "grad_norm": 0.836200475692749, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0504, + "step": 2850 + }, + { + "epoch": 0.08177269478198713, + "grad_norm": 0.4431964159011841, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0527, + "step": 2860 + }, + { + "epoch": 0.08205861329521086, + "grad_norm": 0.4666791260242462, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0556, + "step": 2870 + }, + { + "epoch": 0.0823445318084346, + "grad_norm": 0.5705346465110779, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0544, + "step": 2880 + }, + { + "epoch": 0.08263045032165833, + "grad_norm": 1.7237486839294434, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0555, + "step": 2890 + }, + { + "epoch": 0.08291636883488206, + "grad_norm": 0.9305147528648376, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.055, + "step": 2900 + }, + { + "epoch": 0.0832022873481058, + "grad_norm": 1.3475992679595947, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0723, + "step": 2910 + }, + { + "epoch": 0.08348820586132952, + "grad_norm": 0.7196787595748901, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0581, + "step": 2920 + }, + { + "epoch": 0.08377412437455325, + "grad_norm": 0.4567016363143921, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.08406004288777698, + "grad_norm": 0.8537796139717102, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0589, + "step": 2940 + }, + { + "epoch": 0.08434596140100072, + "grad_norm": 0.9526864886283875, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0581, + "step": 2950 + }, + { + "epoch": 0.08463187991422444, + "grad_norm": 0.8753517866134644, + "learning_rate": 1.979809151602651e-05, + "loss": 0.066, + "step": 2960 + }, + { + "epoch": 0.08491779842744818, + "grad_norm": 0.9062561988830566, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0472, + "step": 2970 + }, + { + "epoch": 0.08520371694067191, + "grad_norm": 1.0018329620361328, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0584, + "step": 2980 + }, + { + "epoch": 0.08548963545389564, + "grad_norm": 1.0577157735824585, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.05, + "step": 2990 + }, + { + "epoch": 0.08577555396711938, + "grad_norm": 1.0216799974441528, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0703, + "step": 3000 + }, + { + "epoch": 0.0860614724803431, + "grad_norm": 0.5581191778182983, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0682, + "step": 3010 + }, + { + "epoch": 0.08634739099356684, + "grad_norm": 0.6187682151794434, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 0.08663330950679056, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0565, + "step": 3030 + }, + { + "epoch": 0.0869192280200143, + "grad_norm": 0.8952509760856628, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0615, + "step": 3040 + }, + { + "epoch": 0.08720514653323802, + "grad_norm": 0.7387855648994446, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0434, + "step": 3050 + }, + { + "epoch": 0.08749106504646176, + "grad_norm": 0.8661363124847412, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0453, + "step": 3060 + }, + { + "epoch": 0.0877769835596855, + "grad_norm": 1.552089810371399, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0577, + "step": 3070 + }, + { + "epoch": 0.08806290207290922, + "grad_norm": 0.7555598616600037, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.058, + "step": 3080 + }, + { + "epoch": 0.08834882058613296, + "grad_norm": 0.7763100266456604, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.074, + "step": 3090 + }, + { + "epoch": 0.08863473909935668, + "grad_norm": 0.5088932514190674, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.07, + "step": 3100 + }, + { + "epoch": 0.08892065761258042, + "grad_norm": 0.517383873462677, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.08920657612580414, + "grad_norm": 0.9673930406570435, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.061, + "step": 3120 + }, + { + "epoch": 0.08949249463902788, + "grad_norm": 1.1182832717895508, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0503, + "step": 3130 + }, + { + "epoch": 0.0897784131522516, + "grad_norm": 0.8064592480659485, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0527, + "step": 3140 + }, + { + "epoch": 0.09006433166547534, + "grad_norm": 1.3616310358047485, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0491, + "step": 3150 + }, + { + "epoch": 0.09035025017869908, + "grad_norm": 0.6205968856811523, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0492, + "step": 3160 + }, + { + "epoch": 0.0906361686919228, + "grad_norm": 0.9427729249000549, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.054, + "step": 3170 + }, + { + "epoch": 0.09092208720514654, + "grad_norm": 0.6940050721168518, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0622, + "step": 3180 + }, + { + "epoch": 0.09120800571837026, + "grad_norm": 0.7082361578941345, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0474, + "step": 3190 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.4606474041938782, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 0.09177984274481772, + "grad_norm": 0.46445760130882263, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0483, + "step": 3210 + }, + { + "epoch": 0.09206576125804146, + "grad_norm": 0.7431371212005615, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.046, + "step": 3220 + }, + { + "epoch": 0.09235167977126518, + "grad_norm": 0.8430010676383972, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0665, + "step": 3230 + }, + { + "epoch": 0.09263759828448892, + "grad_norm": 0.9888875484466553, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0676, + "step": 3240 + }, + { + "epoch": 0.09292351679771266, + "grad_norm": 0.792150616645813, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0636, + "step": 3250 + }, + { + "epoch": 0.09320943531093638, + "grad_norm": 0.859030544757843, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0634, + "step": 3260 + }, + { + "epoch": 0.09349535382416012, + "grad_norm": 0.7612795233726501, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0658, + "step": 3270 + }, + { + "epoch": 0.09378127233738384, + "grad_norm": 0.5470104217529297, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0514, + "step": 3280 + }, + { + "epoch": 0.09406719085060758, + "grad_norm": 0.6354894042015076, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0489, + "step": 3290 + }, + { + "epoch": 0.0943531093638313, + "grad_norm": 1.3852356672286987, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 0.09463902787705504, + "grad_norm": 0.5610274076461792, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.09492494639027876, + "grad_norm": 1.2192410230636597, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0525, + "step": 3320 + }, + { + "epoch": 0.0952108649035025, + "grad_norm": 1.06831955909729, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.051, + "step": 3330 + }, + { + "epoch": 0.09549678341672624, + "grad_norm": 0.32288479804992676, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0503, + "step": 3340 + }, + { + "epoch": 0.09578270192994996, + "grad_norm": 0.5871645212173462, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0564, + "step": 3350 + }, + { + "epoch": 0.0960686204431737, + "grad_norm": 0.6069591045379639, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0495, + "step": 3360 + }, + { + "epoch": 0.09635453895639742, + "grad_norm": 1.0015379190444946, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0624, + "step": 3370 + }, + { + "epoch": 0.09664045746962116, + "grad_norm": 0.7534980773925781, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0618, + "step": 3380 + }, + { + "epoch": 0.09692637598284488, + "grad_norm": 0.45888280868530273, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0445, + "step": 3390 + }, + { + "epoch": 0.09721229449606862, + "grad_norm": 0.7550806403160095, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0461, + "step": 3400 + }, + { + "epoch": 0.09749821300929234, + "grad_norm": 0.4738181531429291, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 0.09778413152251608, + "grad_norm": 0.6711190938949585, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0476, + "step": 3420 + }, + { + "epoch": 0.09807005003573982, + "grad_norm": 0.4751316010951996, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0507, + "step": 3430 + }, + { + "epoch": 0.09835596854896354, + "grad_norm": 0.83565753698349, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "epoch": 0.09864188706218728, + "grad_norm": 0.5360665321350098, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0627, + "step": 3450 + }, + { + "epoch": 0.098927805575411, + "grad_norm": 0.7463604211807251, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0496, + "step": 3460 + }, + { + "epoch": 0.09921372408863474, + "grad_norm": 0.7294344305992126, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0697, + "step": 3470 + }, + { + "epoch": 0.09949964260185847, + "grad_norm": 0.5676283836364746, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0541, + "step": 3480 + }, + { + "epoch": 0.0997855611150822, + "grad_norm": 0.5879732370376587, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 0.10007147962830593, + "grad_norm": 0.832818865776062, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0505, + "step": 3500 + }, + { + "epoch": 0.10035739814152966, + "grad_norm": 0.48553410172462463, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0573, + "step": 3510 + }, + { + "epoch": 0.1006433166547534, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0449, + "step": 3520 + }, + { + "epoch": 0.10092923516797712, + "grad_norm": 0.7497885227203369, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0737, + "step": 3530 + }, + { + "epoch": 0.10121515368120086, + "grad_norm": 0.5581928491592407, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0514, + "step": 3540 + }, + { + "epoch": 0.10150107219442459, + "grad_norm": 1.140236258506775, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0524, + "step": 3550 + }, + { + "epoch": 0.10178699070764832, + "grad_norm": 0.8161870241165161, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0721, + "step": 3560 + }, + { + "epoch": 0.10207290922087205, + "grad_norm": 0.8796533942222595, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0564, + "step": 3570 + }, + { + "epoch": 0.10235882773409578, + "grad_norm": 1.4811128377914429, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.10264474624731951, + "grad_norm": 0.8029062747955322, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0589, + "step": 3590 + }, + { + "epoch": 0.10293066476054324, + "grad_norm": 0.7806634902954102, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0617, + "step": 3600 + }, + { + "epoch": 0.10321658327376698, + "grad_norm": 1.1286838054656982, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0574, + "step": 3610 + }, + { + "epoch": 0.1035025017869907, + "grad_norm": 0.374104768037796, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.047, + "step": 3620 + }, + { + "epoch": 0.10378842030021444, + "grad_norm": 1.1743136644363403, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0676, + "step": 3630 + }, + { + "epoch": 0.10407433881343817, + "grad_norm": 0.7684413194656372, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0663, + "step": 3640 + }, + { + "epoch": 0.1043602573266619, + "grad_norm": 1.0642409324645996, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.051, + "step": 3650 + }, + { + "epoch": 0.10464617583988563, + "grad_norm": 0.7752460837364197, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0543, + "step": 3660 + }, + { + "epoch": 0.10493209435310936, + "grad_norm": 0.9053257703781128, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.051, + "step": 3670 + }, + { + "epoch": 0.10521801286633309, + "grad_norm": 0.7407983541488647, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 0.10550393137955683, + "grad_norm": 1.3622519969940186, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0468, + "step": 3690 + }, + { + "epoch": 0.10578984989278056, + "grad_norm": 1.2751463651657104, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0571, + "step": 3700 + }, + { + "epoch": 0.10607576840600429, + "grad_norm": 0.5535411238670349, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0564, + "step": 3710 + }, + { + "epoch": 0.10636168691922802, + "grad_norm": 0.6728671193122864, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0487, + "step": 3720 + }, + { + "epoch": 0.10664760543245175, + "grad_norm": 0.82345050573349, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0656, + "step": 3730 + }, + { + "epoch": 0.10693352394567548, + "grad_norm": 0.6446594595909119, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0467, + "step": 3740 + }, + { + "epoch": 0.10721944245889921, + "grad_norm": 1.0836280584335327, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0536, + "step": 3750 + }, + { + "epoch": 0.10750536097212295, + "grad_norm": 0.3758300840854645, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0476, + "step": 3760 + }, + { + "epoch": 0.10779127948534667, + "grad_norm": 0.682266116142273, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0519, + "step": 3770 + }, + { + "epoch": 0.1080771979985704, + "grad_norm": 0.5025804042816162, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0474, + "step": 3780 + }, + { + "epoch": 0.10836311651179414, + "grad_norm": 1.019890308380127, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0492, + "step": 3790 + }, + { + "epoch": 0.10864903502501787, + "grad_norm": 0.7843710780143738, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0561, + "step": 3800 + }, + { + "epoch": 0.1089349535382416, + "grad_norm": 0.5028522610664368, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0582, + "step": 3810 + }, + { + "epoch": 0.10922087205146533, + "grad_norm": 0.6400144696235657, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0502, + "step": 3820 + }, + { + "epoch": 0.10950679056468907, + "grad_norm": 0.9432899355888367, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0471, + "step": 3830 + }, + { + "epoch": 0.10979270907791279, + "grad_norm": 0.7582482695579529, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.052, + "step": 3840 + }, + { + "epoch": 0.11007862759113653, + "grad_norm": 0.34035608172416687, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0464, + "step": 3850 + }, + { + "epoch": 0.11036454610436025, + "grad_norm": 1.3330878019332886, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.11065046461758399, + "grad_norm": 0.7309219837188721, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.052, + "step": 3870 + }, + { + "epoch": 0.11093638313080773, + "grad_norm": 0.6248922944068909, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0815, + "step": 3880 + }, + { + "epoch": 0.11122230164403145, + "grad_norm": 0.8298835158348083, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0491, + "step": 3890 + }, + { + "epoch": 0.11150822015725519, + "grad_norm": 0.6728928685188293, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0506, + "step": 3900 + }, + { + "epoch": 0.11179413867047891, + "grad_norm": 0.8456764817237854, + "learning_rate": 1.95567930185928e-05, + "loss": 0.051, + "step": 3910 + }, + { + "epoch": 0.11208005718370265, + "grad_norm": 0.9024212956428528, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0514, + "step": 3920 + }, + { + "epoch": 0.11236597569692637, + "grad_norm": 0.4843275845050812, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.056, + "step": 3930 + }, + { + "epoch": 0.11265189421015011, + "grad_norm": 0.5677530765533447, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0548, + "step": 3940 + }, + { + "epoch": 0.11293781272337383, + "grad_norm": 1.0913296937942505, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0697, + "step": 3950 + }, + { + "epoch": 0.11322373123659757, + "grad_norm": 0.6271129250526428, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0655, + "step": 3960 + }, + { + "epoch": 0.1135096497498213, + "grad_norm": 0.9063813090324402, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0469, + "step": 3970 + }, + { + "epoch": 0.11379556826304503, + "grad_norm": 0.7493836283683777, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0482, + "step": 3980 + }, + { + "epoch": 0.11408148677626877, + "grad_norm": 0.8022870421409607, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0655, + "step": 3990 + }, + { + "epoch": 0.11436740528949249, + "grad_norm": 0.6266750693321228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0542, + "step": 4000 + }, + { + "epoch": 0.11465332380271623, + "grad_norm": 0.45027732849121094, + "learning_rate": 1.95260726824789e-05, + "loss": 0.058, + "step": 4010 + }, + { + "epoch": 0.11493924231593995, + "grad_norm": 0.950760543346405, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0552, + "step": 4020 + }, + { + "epoch": 0.11522516082916369, + "grad_norm": 0.6397078037261963, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0544, + "step": 4030 + }, + { + "epoch": 0.11551107934238741, + "grad_norm": 0.7060579657554626, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0565, + "step": 4040 + }, + { + "epoch": 0.11579699785561115, + "grad_norm": 0.7861781716346741, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0511, + "step": 4050 + }, + { + "epoch": 0.11608291636883489, + "grad_norm": 0.5479229688644409, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0509, + "step": 4060 + }, + { + "epoch": 0.11636883488205861, + "grad_norm": 0.3854960501194, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0623, + "step": 4070 + }, + { + "epoch": 0.11665475339528235, + "grad_norm": 1.9533435106277466, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0643, + "step": 4080 + }, + { + "epoch": 0.11694067190850607, + "grad_norm": 0.5853668451309204, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0453, + "step": 4090 + }, + { + "epoch": 0.11722659042172981, + "grad_norm": 0.6850668787956238, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0555, + "step": 4100 + }, + { + "epoch": 0.11751250893495353, + "grad_norm": 1.1605839729309082, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0586, + "step": 4110 + }, + { + "epoch": 0.11779842744817727, + "grad_norm": 0.7753151059150696, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0623, + "step": 4120 + }, + { + "epoch": 0.118084345961401, + "grad_norm": 0.7955726385116577, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0467, + "step": 4130 + }, + { + "epoch": 0.11837026447462473, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0523, + "step": 4140 + }, + { + "epoch": 0.11865618298784847, + "grad_norm": 0.5821241140365601, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 0.11894210150107219, + "grad_norm": 0.4795539379119873, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0559, + "step": 4160 + }, + { + "epoch": 0.11922802001429593, + "grad_norm": 0.6324377655982971, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0473, + "step": 4170 + }, + { + "epoch": 0.11951393852751965, + "grad_norm": 0.8578745722770691, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0474, + "step": 4180 + }, + { + "epoch": 0.11979985704074339, + "grad_norm": 0.5988736748695374, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 0.12008577555396711, + "grad_norm": 0.8098701238632202, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0511, + "step": 4200 + }, + { + "epoch": 0.12037169406719085, + "grad_norm": 1.2059956789016724, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0501, + "step": 4210 + }, + { + "epoch": 0.12065761258041457, + "grad_norm": 0.7477571368217468, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0565, + "step": 4220 + }, + { + "epoch": 0.12094353109363831, + "grad_norm": 0.467942476272583, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0601, + "step": 4230 + }, + { + "epoch": 0.12122944960686205, + "grad_norm": 0.5761682391166687, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.052, + "step": 4240 + }, + { + "epoch": 0.12151536812008577, + "grad_norm": 0.8247032761573792, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0503, + "step": 4250 + }, + { + "epoch": 0.12180128663330951, + "grad_norm": 0.5218040347099304, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0434, + "step": 4260 + }, + { + "epoch": 0.12208720514653323, + "grad_norm": 0.5024936199188232, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 0.12237312365975697, + "grad_norm": 0.5558021664619446, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0493, + "step": 4280 + }, + { + "epoch": 0.1226590421729807, + "grad_norm": 0.6252139210700989, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0467, + "step": 4290 + }, + { + "epoch": 0.12294496068620443, + "grad_norm": 0.6613588929176331, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0572, + "step": 4300 + }, + { + "epoch": 0.12323087919942816, + "grad_norm": 0.8098927736282349, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0551, + "step": 4310 + }, + { + "epoch": 0.1235167977126519, + "grad_norm": 0.8598331809043884, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0517, + "step": 4320 + }, + { + "epoch": 0.12380271622587563, + "grad_norm": 1.2555822134017944, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0499, + "step": 4330 + }, + { + "epoch": 0.12408863473909935, + "grad_norm": 0.5311633348464966, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0467, + "step": 4340 + }, + { + "epoch": 0.12437455325232309, + "grad_norm": 0.5674521327018738, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0564, + "step": 4350 + }, + { + "epoch": 0.12466047176554682, + "grad_norm": 0.5226582884788513, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0503, + "step": 4360 + }, + { + "epoch": 0.12494639027877055, + "grad_norm": 0.8510275483131409, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0626, + "step": 4370 + }, + { + "epoch": 0.1252323087919943, + "grad_norm": 1.6184005737304688, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0565, + "step": 4380 + }, + { + "epoch": 0.125518227305218, + "grad_norm": 0.7836401462554932, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0567, + "step": 4390 + }, + { + "epoch": 0.12580414581844174, + "grad_norm": 0.686989963054657, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0727, + "step": 4400 + }, + { + "epoch": 0.12609006433166547, + "grad_norm": 0.6000984907150269, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0459, + "step": 4410 + }, + { + "epoch": 0.1263759828448892, + "grad_norm": 0.8751336932182312, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0509, + "step": 4420 + }, + { + "epoch": 0.12666190135811295, + "grad_norm": 0.9281551837921143, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0536, + "step": 4430 + }, + { + "epoch": 0.12694781987133666, + "grad_norm": 0.5268979668617249, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 0.1272337383845604, + "grad_norm": 0.9246962070465088, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0525, + "step": 4450 + }, + { + "epoch": 0.12751965689778413, + "grad_norm": 1.2159569263458252, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0559, + "step": 4460 + }, + { + "epoch": 0.12780557541100787, + "grad_norm": 1.1705470085144043, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0473, + "step": 4470 + }, + { + "epoch": 0.12809149392423158, + "grad_norm": 0.4624033570289612, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0385, + "step": 4480 + }, + { + "epoch": 0.12837741243745532, + "grad_norm": 0.68497633934021, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.055, + "step": 4490 + }, + { + "epoch": 0.12866333095067906, + "grad_norm": 0.6132450699806213, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0512, + "step": 4500 + }, + { + "epoch": 0.1289492494639028, + "grad_norm": 0.7438398003578186, + "learning_rate": 1.935753861926916e-05, + "loss": 0.057, + "step": 4510 + }, + { + "epoch": 0.12923516797712653, + "grad_norm": 1.01064133644104, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0542, + "step": 4520 + }, + { + "epoch": 0.12952108649035024, + "grad_norm": 0.7620115280151367, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0511, + "step": 4530 + }, + { + "epoch": 0.12980700500357398, + "grad_norm": 0.8325042128562927, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0434, + "step": 4540 + }, + { + "epoch": 0.13009292351679771, + "grad_norm": 1.333525538444519, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0527, + "step": 4550 + }, + { + "epoch": 0.13037884203002145, + "grad_norm": 0.5498093962669373, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0455, + "step": 4560 + }, + { + "epoch": 0.13066476054324516, + "grad_norm": 0.8072503209114075, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0471, + "step": 4570 + }, + { + "epoch": 0.1309506790564689, + "grad_norm": 0.7596970200538635, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0476, + "step": 4580 + }, + { + "epoch": 0.13123659756969264, + "grad_norm": 0.5895066857337952, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.058, + "step": 4590 + }, + { + "epoch": 0.13152251608291637, + "grad_norm": 0.7977209687232971, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0453, + "step": 4600 + }, + { + "epoch": 0.1318084345961401, + "grad_norm": 0.6070771813392639, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0626, + "step": 4610 + }, + { + "epoch": 0.13209435310936382, + "grad_norm": 0.776318371295929, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0502, + "step": 4620 + }, + { + "epoch": 0.13238027162258756, + "grad_norm": 0.7913787961006165, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0495, + "step": 4630 + }, + { + "epoch": 0.1326661901358113, + "grad_norm": 0.7327920794487, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0537, + "step": 4640 + }, + { + "epoch": 0.13295210864903503, + "grad_norm": 1.2004122734069824, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0479, + "step": 4650 + }, + { + "epoch": 0.13323802716225874, + "grad_norm": 0.663301408290863, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0426, + "step": 4660 + }, + { + "epoch": 0.13352394567548248, + "grad_norm": 0.7744486331939697, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0538, + "step": 4670 + }, + { + "epoch": 0.13380986418870622, + "grad_norm": 0.6179795265197754, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.13409578270192996, + "grad_norm": 0.6461634635925293, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0458, + "step": 4690 + }, + { + "epoch": 0.1343817012151537, + "grad_norm": 0.6578474640846252, + "learning_rate": 1.928703895604588e-05, + "loss": 0.064, + "step": 4700 + }, + { + "epoch": 0.1346676197283774, + "grad_norm": 0.8851020336151123, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0632, + "step": 4710 + }, + { + "epoch": 0.13495353824160114, + "grad_norm": 0.4704781472682953, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0431, + "step": 4720 + }, + { + "epoch": 0.13523945675482488, + "grad_norm": 0.9809741377830505, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.059, + "step": 4730 + }, + { + "epoch": 0.13552537526804861, + "grad_norm": 0.9307458400726318, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0528, + "step": 4740 + }, + { + "epoch": 0.13581129378127232, + "grad_norm": 0.8084405660629272, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0603, + "step": 4750 + }, + { + "epoch": 0.13609721229449606, + "grad_norm": 0.6919799447059631, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0589, + "step": 4760 + }, + { + "epoch": 0.1363831308077198, + "grad_norm": 0.8543849587440491, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0508, + "step": 4770 + }, + { + "epoch": 0.13666904932094354, + "grad_norm": 0.6308473348617554, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0485, + "step": 4780 + }, + { + "epoch": 0.13695496783416727, + "grad_norm": 0.739931046962738, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0534, + "step": 4790 + }, + { + "epoch": 0.13724088634739098, + "grad_norm": 0.7895604372024536, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0449, + "step": 4800 + }, + { + "epoch": 0.13752680486061472, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0496, + "step": 4810 + }, + { + "epoch": 0.13781272337383846, + "grad_norm": 0.5999978184700012, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.1380986418870622, + "grad_norm": 0.8037213087081909, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0667, + "step": 4830 + }, + { + "epoch": 0.1383845604002859, + "grad_norm": 0.7414689064025879, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0509, + "step": 4840 + }, + { + "epoch": 0.13867047891350964, + "grad_norm": 0.6627739667892456, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0515, + "step": 4850 + }, + { + "epoch": 0.13895639742673338, + "grad_norm": 0.6969587802886963, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0626, + "step": 4860 + }, + { + "epoch": 0.13924231593995712, + "grad_norm": 0.7554855942726135, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0478, + "step": 4870 + }, + { + "epoch": 0.13952823445318085, + "grad_norm": 0.5623564124107361, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.044, + "step": 4880 + }, + { + "epoch": 0.13981415296640456, + "grad_norm": 0.6897832751274109, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0466, + "step": 4890 + }, + { + "epoch": 0.1401000714796283, + "grad_norm": 0.5474520921707153, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0514, + "step": 4900 + }, + { + "epoch": 0.14038598999285204, + "grad_norm": 0.9736361503601074, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0472, + "step": 4910 + }, + { + "epoch": 0.14067190850607578, + "grad_norm": 0.5566041469573975, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0533, + "step": 4920 + }, + { + "epoch": 0.1409578270192995, + "grad_norm": 1.0295166969299316, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0478, + "step": 4930 + }, + { + "epoch": 0.14124374553252322, + "grad_norm": 1.0931389331817627, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0652, + "step": 4940 + }, + { + "epoch": 0.14152966404574696, + "grad_norm": 1.3054399490356445, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0564, + "step": 4950 + }, + { + "epoch": 0.1418155825589707, + "grad_norm": 0.45592883229255676, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0456, + "step": 4960 + }, + { + "epoch": 0.14210150107219444, + "grad_norm": 0.6758268475532532, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0543, + "step": 4970 + }, + { + "epoch": 0.14238741958541815, + "grad_norm": 0.9643615484237671, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 0.14267333809864188, + "grad_norm": 0.565969705581665, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0458, + "step": 4990 + }, + { + "epoch": 0.14295925661186562, + "grad_norm": 0.8053064346313477, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0558, + "step": 5000 + }, + { + "epoch": 0.14324517512508936, + "grad_norm": 0.606215238571167, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0562, + "step": 5010 + }, + { + "epoch": 0.14353109363831307, + "grad_norm": 0.5565656423568726, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0543, + "step": 5020 + }, + { + "epoch": 0.1438170121515368, + "grad_norm": 0.353696346282959, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0451, + "step": 5030 + }, + { + "epoch": 0.14410293066476054, + "grad_norm": 0.6627641916275024, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0607, + "step": 5040 + }, + { + "epoch": 0.14438884917798428, + "grad_norm": 0.7896742224693298, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0496, + "step": 5050 + }, + { + "epoch": 0.14467476769120802, + "grad_norm": 0.7444631457328796, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0641, + "step": 5060 + }, + { + "epoch": 0.14496068620443173, + "grad_norm": 0.7871376872062683, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 0.14524660471765546, + "grad_norm": 0.7784642577171326, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0466, + "step": 5080 + }, + { + "epoch": 0.1455325232308792, + "grad_norm": 0.6950685381889343, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0457, + "step": 5090 + }, + { + "epoch": 0.14581844174410294, + "grad_norm": 1.0631619691848755, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0513, + "step": 5100 + }, + { + "epoch": 0.14610436025732665, + "grad_norm": 0.4327051639556885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0599, + "step": 5110 + }, + { + "epoch": 0.14639027877055039, + "grad_norm": 0.7790032029151917, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0617, + "step": 5120 + }, + { + "epoch": 0.14667619728377412, + "grad_norm": 0.42061591148376465, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.14696211579699786, + "grad_norm": 1.4090712070465088, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0564, + "step": 5140 + }, + { + "epoch": 0.1472480343102216, + "grad_norm": 0.540844738483429, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0505, + "step": 5150 + }, + { + "epoch": 0.1475339528234453, + "grad_norm": 0.5608566999435425, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0583, + "step": 5160 + }, + { + "epoch": 0.14781987133666905, + "grad_norm": 0.750708818435669, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0467, + "step": 5170 + }, + { + "epoch": 0.14810578984989278, + "grad_norm": 0.608989953994751, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0507, + "step": 5180 + }, + { + "epoch": 0.14839170836311652, + "grad_norm": 0.8176707029342651, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0455, + "step": 5190 + }, + { + "epoch": 0.14867762687634023, + "grad_norm": 0.5280511379241943, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0586, + "step": 5200 + }, + { + "epoch": 0.14896354538956397, + "grad_norm": 0.5914652347564697, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.042, + "step": 5210 + }, + { + "epoch": 0.1492494639027877, + "grad_norm": 0.4816238582134247, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0431, + "step": 5220 + }, + { + "epoch": 0.14953538241601144, + "grad_norm": 0.5413132309913635, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0453, + "step": 5230 + }, + { + "epoch": 0.14982130092923518, + "grad_norm": 0.749200701713562, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0505, + "step": 5240 + }, + { + "epoch": 0.1501072194424589, + "grad_norm": 0.8051598072052002, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.15039313795568263, + "grad_norm": 0.5365609526634216, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0383, + "step": 5260 + }, + { + "epoch": 0.15067905646890636, + "grad_norm": 0.5546812415122986, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0438, + "step": 5270 + }, + { + "epoch": 0.1509649749821301, + "grad_norm": 0.6248345375061035, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.045, + "step": 5280 + }, + { + "epoch": 0.1512508934953538, + "grad_norm": 0.42673179507255554, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0435, + "step": 5290 + }, + { + "epoch": 0.15153681200857755, + "grad_norm": 0.6677115559577942, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 0.15182273052180129, + "grad_norm": 0.4739227294921875, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0516, + "step": 5310 + }, + { + "epoch": 0.15210864903502502, + "grad_norm": 0.7931821346282959, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0566, + "step": 5320 + }, + { + "epoch": 0.15239456754824876, + "grad_norm": 0.6296460032463074, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0496, + "step": 5330 + }, + { + "epoch": 0.15268048606147247, + "grad_norm": 0.6713911890983582, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0462, + "step": 5340 + }, + { + "epoch": 0.1529664045746962, + "grad_norm": 1.088040828704834, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0663, + "step": 5350 + }, + { + "epoch": 0.15325232308791994, + "grad_norm": 1.4942265748977661, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0541, + "step": 5360 + }, + { + "epoch": 0.15353824160114368, + "grad_norm": 1.5721286535263062, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0546, + "step": 5370 + }, + { + "epoch": 0.1538241601143674, + "grad_norm": 0.9329798221588135, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0538, + "step": 5380 + }, + { + "epoch": 0.15411007862759113, + "grad_norm": 0.5658103823661804, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0561, + "step": 5390 + }, + { + "epoch": 0.15439599714081487, + "grad_norm": 0.6210218071937561, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.054, + "step": 5400 + }, + { + "epoch": 0.1546819156540386, + "grad_norm": 0.7934702634811401, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0506, + "step": 5410 + }, + { + "epoch": 0.15496783416726234, + "grad_norm": 1.0321810245513916, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0483, + "step": 5420 + }, + { + "epoch": 0.15525375268048605, + "grad_norm": 0.6226248145103455, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0464, + "step": 5430 + }, + { + "epoch": 0.1555396711937098, + "grad_norm": 0.6217877864837646, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0466, + "step": 5440 + }, + { + "epoch": 0.15582558970693353, + "grad_norm": 0.44068101048469543, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0517, + "step": 5450 + }, + { + "epoch": 0.15611150822015726, + "grad_norm": 0.4715922772884369, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0391, + "step": 5460 + }, + { + "epoch": 0.15639742673338097, + "grad_norm": 0.6649858951568604, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0524, + "step": 5470 + }, + { + "epoch": 0.1566833452466047, + "grad_norm": 0.5635918974876404, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.054, + "step": 5480 + }, + { + "epoch": 0.15696926375982845, + "grad_norm": 0.5584990978240967, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0559, + "step": 5490 + }, + { + "epoch": 0.15725518227305219, + "grad_norm": 0.7777124047279358, + "learning_rate": 1.895206504082557e-05, + "loss": 0.052, + "step": 5500 + }, + { + "epoch": 0.15754110078627592, + "grad_norm": 0.7057285308837891, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0507, + "step": 5510 + }, + { + "epoch": 0.15782701929949963, + "grad_norm": 0.4290146827697754, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0508, + "step": 5520 + }, + { + "epoch": 0.15811293781272337, + "grad_norm": 0.7333746552467346, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0609, + "step": 5530 + }, + { + "epoch": 0.1583988563259471, + "grad_norm": 0.6905514001846313, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0441, + "step": 5540 + }, + { + "epoch": 0.15868477483917084, + "grad_norm": 0.4859441816806793, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0586, + "step": 5550 + }, + { + "epoch": 0.15897069335239455, + "grad_norm": 0.4259501099586487, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0446, + "step": 5560 + }, + { + "epoch": 0.1592566118656183, + "grad_norm": 0.7659216523170471, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0486, + "step": 5570 + }, + { + "epoch": 0.15954253037884203, + "grad_norm": 0.6377918124198914, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0497, + "step": 5580 + }, + { + "epoch": 0.15982844889206577, + "grad_norm": 0.9122095704078674, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0497, + "step": 5590 + }, + { + "epoch": 0.1601143674052895, + "grad_norm": 0.5986319780349731, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0789, + "step": 5600 + }, + { + "epoch": 0.1604002859185132, + "grad_norm": 0.6486982107162476, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0481, + "step": 5610 + }, + { + "epoch": 0.16068620443173695, + "grad_norm": 0.9778286814689636, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0609, + "step": 5620 + }, + { + "epoch": 0.1609721229449607, + "grad_norm": 0.9133608341217041, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0499, + "step": 5630 + }, + { + "epoch": 0.16125804145818443, + "grad_norm": 0.8979085087776184, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0539, + "step": 5640 + }, + { + "epoch": 0.16154395997140814, + "grad_norm": 0.7787102460861206, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0593, + "step": 5650 + }, + { + "epoch": 0.16182987848463187, + "grad_norm": 0.8269296884536743, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0466, + "step": 5660 + }, + { + "epoch": 0.1621157969978556, + "grad_norm": 1.0018537044525146, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0542, + "step": 5670 + }, + { + "epoch": 0.16240171551107935, + "grad_norm": 0.6690066456794739, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0504, + "step": 5680 + }, + { + "epoch": 0.16268763402430308, + "grad_norm": 0.8186119198799133, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0523, + "step": 5690 + }, + { + "epoch": 0.1629735525375268, + "grad_norm": 0.6039218902587891, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.053, + "step": 5700 + }, + { + "epoch": 0.16325947105075053, + "grad_norm": 0.5570294857025146, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0627, + "step": 5710 + }, + { + "epoch": 0.16354538956397427, + "grad_norm": 0.6330029368400574, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.043, + "step": 5720 + }, + { + "epoch": 0.163831308077198, + "grad_norm": 0.42857953906059265, + "learning_rate": 1.884459101447439e-05, + "loss": 0.043, + "step": 5730 + }, + { + "epoch": 0.16411722659042172, + "grad_norm": 0.6611765027046204, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0478, + "step": 5740 + }, + { + "epoch": 0.16440314510364545, + "grad_norm": 0.5025321841239929, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0563, + "step": 5750 + }, + { + "epoch": 0.1646890636168692, + "grad_norm": 0.468772292137146, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0579, + "step": 5760 + }, + { + "epoch": 0.16497498213009293, + "grad_norm": 0.8914149403572083, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0451, + "step": 5770 + }, + { + "epoch": 0.16526090064331667, + "grad_norm": 0.7421362996101379, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0446, + "step": 5780 + }, + { + "epoch": 0.16554681915654038, + "grad_norm": 0.6159907579421997, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0486, + "step": 5790 + }, + { + "epoch": 0.1658327376697641, + "grad_norm": 0.7762402892112732, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0528, + "step": 5800 + }, + { + "epoch": 0.16611865618298785, + "grad_norm": 0.688562273979187, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0562, + "step": 5810 + }, + { + "epoch": 0.1664045746962116, + "grad_norm": 0.6233720183372498, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0454, + "step": 5820 + }, + { + "epoch": 0.1666904932094353, + "grad_norm": 1.0762931108474731, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0586, + "step": 5830 + }, + { + "epoch": 0.16697641172265903, + "grad_norm": 0.6782101988792419, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0486, + "step": 5840 + }, + { + "epoch": 0.16726233023588277, + "grad_norm": 0.8854986429214478, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0668, + "step": 5850 + }, + { + "epoch": 0.1675482487491065, + "grad_norm": 0.6537308096885681, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0456, + "step": 5860 + }, + { + "epoch": 0.16783416726233025, + "grad_norm": 1.4588080644607544, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0531, + "step": 5870 + }, + { + "epoch": 0.16812008577555396, + "grad_norm": 0.4888838529586792, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0608, + "step": 5880 + }, + { + "epoch": 0.1684060042887777, + "grad_norm": 0.6046859622001648, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0596, + "step": 5890 + }, + { + "epoch": 0.16869192280200143, + "grad_norm": 1.0373053550720215, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0672, + "step": 5900 + }, + { + "epoch": 0.16897784131522517, + "grad_norm": 0.7728743553161621, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0567, + "step": 5910 + }, + { + "epoch": 0.16926375982844888, + "grad_norm": 0.7804396152496338, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0443, + "step": 5920 + }, + { + "epoch": 0.16954967834167262, + "grad_norm": 0.5331568717956543, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0462, + "step": 5930 + }, + { + "epoch": 0.16983559685489635, + "grad_norm": 0.5623118877410889, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0549, + "step": 5940 + }, + { + "epoch": 0.1701215153681201, + "grad_norm": 0.5113009214401245, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.17040743388134383, + "grad_norm": 0.45996031165122986, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0474, + "step": 5960 + }, + { + "epoch": 0.17069335239456754, + "grad_norm": 0.9673702716827393, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0496, + "step": 5970 + }, + { + "epoch": 0.17097927090779128, + "grad_norm": 0.6134442687034607, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0472, + "step": 5980 + }, + { + "epoch": 0.171265189421015, + "grad_norm": 0.5929660797119141, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0448, + "step": 5990 + }, + { + "epoch": 0.17155110793423875, + "grad_norm": 0.6973591446876526, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0561, + "step": 6000 + }, + { + "epoch": 0.17183702644746246, + "grad_norm": 0.6361686587333679, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0528, + "step": 6010 + }, + { + "epoch": 0.1721229449606862, + "grad_norm": 0.8463344573974609, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0505, + "step": 6020 + }, + { + "epoch": 0.17240886347390993, + "grad_norm": 0.7931243777275085, + "learning_rate": 1.869709961183946e-05, + "loss": 0.047, + "step": 6030 + }, + { + "epoch": 0.17269478198713367, + "grad_norm": 0.8827673196792603, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0449, + "step": 6040 + }, + { + "epoch": 0.1729807005003574, + "grad_norm": 0.624167263507843, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0432, + "step": 6050 + }, + { + "epoch": 0.17326661901358112, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0546, + "step": 6060 + }, + { + "epoch": 0.17355253752680486, + "grad_norm": 0.6836652755737305, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0463, + "step": 6070 + }, + { + "epoch": 0.1738384560400286, + "grad_norm": 0.5454772114753723, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0554, + "step": 6080 + }, + { + "epoch": 0.17412437455325233, + "grad_norm": 0.3758164048194885, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0437, + "step": 6090 + }, + { + "epoch": 0.17441029306647604, + "grad_norm": 0.4269026517868042, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0436, + "step": 6100 + }, + { + "epoch": 0.17469621157969978, + "grad_norm": 1.3504232168197632, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0563, + "step": 6110 + }, + { + "epoch": 0.17498213009292352, + "grad_norm": 0.6270191669464111, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0552, + "step": 6120 + }, + { + "epoch": 0.17526804860614725, + "grad_norm": 0.7632624506950378, + "learning_rate": 1.864612143364565e-05, + "loss": 0.042, + "step": 6130 + }, + { + "epoch": 0.175553967119371, + "grad_norm": 0.7420883774757385, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0472, + "step": 6140 + }, + { + "epoch": 0.1758398856325947, + "grad_norm": 0.38518550992012024, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0494, + "step": 6150 + }, + { + "epoch": 0.17612580414581844, + "grad_norm": 0.4203122556209564, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.049, + "step": 6160 + }, + { + "epoch": 0.17641172265904217, + "grad_norm": 0.843169093132019, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0528, + "step": 6170 + }, + { + "epoch": 0.1766976411722659, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0441, + "step": 6180 + }, + { + "epoch": 0.17698355968548962, + "grad_norm": 0.9894040822982788, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0494, + "step": 6190 + }, + { + "epoch": 0.17726947819871336, + "grad_norm": 0.8269744515419006, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0533, + "step": 6200 + }, + { + "epoch": 0.1775553967119371, + "grad_norm": 0.7923200726509094, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0518, + "step": 6210 + }, + { + "epoch": 0.17784131522516083, + "grad_norm": 0.580436646938324, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 0.17812723373838457, + "grad_norm": 1.0633399486541748, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0528, + "step": 6230 + }, + { + "epoch": 0.17841315225160828, + "grad_norm": 0.925599217414856, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0552, + "step": 6240 + }, + { + "epoch": 0.17869907076483202, + "grad_norm": 0.5874597430229187, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0473, + "step": 6250 + }, + { + "epoch": 0.17898498927805576, + "grad_norm": 0.9065818190574646, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0504, + "step": 6260 + }, + { + "epoch": 0.1792709077912795, + "grad_norm": 0.9060930609703064, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0578, + "step": 6270 + }, + { + "epoch": 0.1795568263045032, + "grad_norm": 0.6221855878829956, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.17984274481772694, + "grad_norm": 0.589621901512146, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0452, + "step": 6290 + }, + { + "epoch": 0.18012866333095068, + "grad_norm": 0.4308580756187439, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0528, + "step": 6300 + }, + { + "epoch": 0.18041458184417442, + "grad_norm": 0.34031248092651367, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0544, + "step": 6310 + }, + { + "epoch": 0.18070050035739815, + "grad_norm": 0.6438931226730347, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0411, + "step": 6320 + }, + { + "epoch": 0.18098641887062186, + "grad_norm": 0.5436957478523254, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0381, + "step": 6330 + }, + { + "epoch": 0.1812723373838456, + "grad_norm": 0.7326043248176575, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0486, + "step": 6340 + }, + { + "epoch": 0.18155825589706934, + "grad_norm": 0.9194608330726624, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0455, + "step": 6350 + }, + { + "epoch": 0.18184417441029307, + "grad_norm": 0.9366886019706726, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0529, + "step": 6360 + }, + { + "epoch": 0.18213009292351678, + "grad_norm": 0.3178311586380005, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0455, + "step": 6370 + }, + { + "epoch": 0.18241601143674052, + "grad_norm": 0.9811000823974609, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.054, + "step": 6380 + }, + { + "epoch": 0.18270192994996426, + "grad_norm": 0.4635869562625885, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0466, + "step": 6390 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.6958444118499756, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0448, + "step": 6400 + }, + { + "epoch": 0.18327376697641173, + "grad_norm": 0.765814483165741, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0558, + "step": 6410 + }, + { + "epoch": 0.18355968548963544, + "grad_norm": 0.4117525815963745, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0484, + "step": 6420 + }, + { + "epoch": 0.18384560400285918, + "grad_norm": 0.6114997267723083, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0449, + "step": 6430 + }, + { + "epoch": 0.18413152251608292, + "grad_norm": 0.6006572842597961, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0442, + "step": 6440 + }, + { + "epoch": 0.18441744102930666, + "grad_norm": 0.5918669104576111, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0472, + "step": 6450 + }, + { + "epoch": 0.18470335954253037, + "grad_norm": 0.42107391357421875, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0471, + "step": 6460 + }, + { + "epoch": 0.1849892780557541, + "grad_norm": 0.5666350722312927, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0451, + "step": 6470 + }, + { + "epoch": 0.18527519656897784, + "grad_norm": 0.6074198484420776, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.051, + "step": 6480 + }, + { + "epoch": 0.18556111508220158, + "grad_norm": 0.771105945110321, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0402, + "step": 6490 + }, + { + "epoch": 0.18584703359542531, + "grad_norm": 0.6381934881210327, + "learning_rate": 1.844974808419918e-05, + "loss": 0.049, + "step": 6500 + }, + { + "epoch": 0.18613295210864902, + "grad_norm": 0.4039069712162018, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0477, + "step": 6510 + }, + { + "epoch": 0.18641887062187276, + "grad_norm": 0.8936404585838318, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0515, + "step": 6520 + }, + { + "epoch": 0.1867047891350965, + "grad_norm": 0.5358276963233948, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0397, + "step": 6530 + }, + { + "epoch": 0.18699070764832024, + "grad_norm": 0.7260947823524475, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0501, + "step": 6540 + }, + { + "epoch": 0.18727662616154395, + "grad_norm": 0.6378960609436035, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0575, + "step": 6550 + }, + { + "epoch": 0.18756254467476768, + "grad_norm": 0.5879429578781128, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.041, + "step": 6560 + }, + { + "epoch": 0.18784846318799142, + "grad_norm": 0.846297025680542, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0494, + "step": 6570 + }, + { + "epoch": 0.18813438170121516, + "grad_norm": 0.5211764574050903, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0463, + "step": 6580 + }, + { + "epoch": 0.1884203002144389, + "grad_norm": 0.8060504794120789, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0486, + "step": 6590 + }, + { + "epoch": 0.1887062187276626, + "grad_norm": 0.5741685628890991, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0435, + "step": 6600 + }, + { + "epoch": 0.18899213724088634, + "grad_norm": 0.6195408701896667, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0609, + "step": 6610 + }, + { + "epoch": 0.18927805575411008, + "grad_norm": 0.46843090653419495, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0477, + "step": 6620 + }, + { + "epoch": 0.18956397426733382, + "grad_norm": 0.5169982314109802, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0515, + "step": 6630 + }, + { + "epoch": 0.18984989278055753, + "grad_norm": 0.5571608543395996, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0492, + "step": 6640 + }, + { + "epoch": 0.19013581129378126, + "grad_norm": 0.7798209190368652, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0682, + "step": 6650 + }, + { + "epoch": 0.190421729807005, + "grad_norm": 0.6120383143424988, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0516, + "step": 6660 + }, + { + "epoch": 0.19070764832022874, + "grad_norm": 1.0191924571990967, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.049, + "step": 6670 + }, + { + "epoch": 0.19099356683345248, + "grad_norm": 0.5271646976470947, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0461, + "step": 6680 + }, + { + "epoch": 0.1912794853466762, + "grad_norm": 0.3315111994743347, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 0.19156540385989992, + "grad_norm": 0.7598944306373596, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0576, + "step": 6700 + }, + { + "epoch": 0.19185132237312366, + "grad_norm": 0.8039186596870422, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0489, + "step": 6710 + }, + { + "epoch": 0.1921372408863474, + "grad_norm": 0.911704957485199, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0508, + "step": 6720 + }, + { + "epoch": 0.1924231593995711, + "grad_norm": 0.6092261672019958, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0494, + "step": 6730 + }, + { + "epoch": 0.19270907791279485, + "grad_norm": 0.7890674471855164, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.049, + "step": 6740 + }, + { + "epoch": 0.19299499642601858, + "grad_norm": 0.8601320385932922, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0402, + "step": 6750 + }, + { + "epoch": 0.19328091493924232, + "grad_norm": 0.8750951290130615, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0517, + "step": 6760 + }, + { + "epoch": 0.19356683345246606, + "grad_norm": 0.7143217921257019, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0414, + "step": 6770 + }, + { + "epoch": 0.19385275196568977, + "grad_norm": 0.8340809345245361, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.1941386704789135, + "grad_norm": 0.4074079692363739, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0455, + "step": 6790 + }, + { + "epoch": 0.19442458899213724, + "grad_norm": 0.5369135737419128, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0472, + "step": 6800 + }, + { + "epoch": 0.19471050750536098, + "grad_norm": 0.44467195868492126, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 0.1949964260185847, + "grad_norm": 0.6032440662384033, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0479, + "step": 6820 + }, + { + "epoch": 0.19528234453180843, + "grad_norm": 0.4078349173069, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 0.19556826304503216, + "grad_norm": 0.49480268359184265, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0432, + "step": 6840 + }, + { + "epoch": 0.1958541815582559, + "grad_norm": 0.9844514727592468, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0479, + "step": 6850 + }, + { + "epoch": 0.19614010007147964, + "grad_norm": 1.1353951692581177, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0539, + "step": 6860 + }, + { + "epoch": 0.19642601858470335, + "grad_norm": 0.7535272836685181, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0572, + "step": 6870 + }, + { + "epoch": 0.1967119370979271, + "grad_norm": 0.4950162470340729, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0466, + "step": 6880 + }, + { + "epoch": 0.19699785561115082, + "grad_norm": 0.5310598015785217, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0487, + "step": 6890 + }, + { + "epoch": 0.19728377412437456, + "grad_norm": 0.9481188654899597, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0448, + "step": 6900 + }, + { + "epoch": 0.19756969263759827, + "grad_norm": 0.5303207039833069, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0419, + "step": 6910 + }, + { + "epoch": 0.197855611150822, + "grad_norm": 0.6180852055549622, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 0.19814152966404575, + "grad_norm": 0.5310384631156921, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0471, + "step": 6930 + }, + { + "epoch": 0.19842744817726948, + "grad_norm": 0.546660304069519, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0481, + "step": 6940 + }, + { + "epoch": 0.19871336669049322, + "grad_norm": 0.7824214696884155, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0423, + "step": 6950 + }, + { + "epoch": 0.19899928520371693, + "grad_norm": 0.9130761623382568, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0436, + "step": 6960 + }, + { + "epoch": 0.19928520371694067, + "grad_norm": 1.0512481927871704, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 0.1995711222301644, + "grad_norm": 0.8660218715667725, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0533, + "step": 6980 + }, + { + "epoch": 0.19985704074338814, + "grad_norm": 0.5280078649520874, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0455, + "step": 6990 + }, + { + "epoch": 0.20014295925661185, + "grad_norm": 0.6151753067970276, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0476, + "step": 7000 + }, + { + "epoch": 0.2004288777698356, + "grad_norm": 0.7165628671646118, + "learning_rate": 1.815952390818299e-05, + "loss": 0.051, + "step": 7010 + }, + { + "epoch": 0.20071479628305933, + "grad_norm": 0.6857513189315796, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0566, + "step": 7020 + }, + { + "epoch": 0.20100071479628306, + "grad_norm": 0.5589154958724976, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0741, + "step": 7030 + }, + { + "epoch": 0.2012866333095068, + "grad_norm": 0.6684713959693909, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0461, + "step": 7040 + }, + { + "epoch": 0.2015725518227305, + "grad_norm": 0.41142046451568604, + "learning_rate": 1.813582526827608e-05, + "loss": 0.043, + "step": 7050 + }, + { + "epoch": 0.20185847033595425, + "grad_norm": 0.29734253883361816, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0464, + "step": 7060 + }, + { + "epoch": 0.20214438884917799, + "grad_norm": 0.3914707899093628, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.052, + "step": 7070 + }, + { + "epoch": 0.20243030736240172, + "grad_norm": 0.5075880885124207, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0401, + "step": 7080 + }, + { + "epoch": 0.20271622587562543, + "grad_norm": 0.6182138919830322, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0428, + "step": 7090 + }, + { + "epoch": 0.20300214438884917, + "grad_norm": 1.0438663959503174, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0476, + "step": 7100 + }, + { + "epoch": 0.2032880629020729, + "grad_norm": 0.4646940529346466, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0419, + "step": 7110 + }, + { + "epoch": 0.20357398141529665, + "grad_norm": 0.4236893951892853, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0539, + "step": 7120 + }, + { + "epoch": 0.20385989992852038, + "grad_norm": 0.7975651025772095, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0459, + "step": 7130 + }, + { + "epoch": 0.2041458184417441, + "grad_norm": 0.9628227949142456, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0568, + "step": 7140 + }, + { + "epoch": 0.20443173695496783, + "grad_norm": 0.8878718614578247, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0528, + "step": 7150 + }, + { + "epoch": 0.20471765546819157, + "grad_norm": 0.5407359004020691, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0417, + "step": 7160 + }, + { + "epoch": 0.2050035739814153, + "grad_norm": 0.4407803416252136, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0435, + "step": 7170 + }, + { + "epoch": 0.20528949249463901, + "grad_norm": 0.4055456221103668, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0652, + "step": 7180 + }, + { + "epoch": 0.20557541100786275, + "grad_norm": 0.44706887006759644, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0476, + "step": 7190 + }, + { + "epoch": 0.2058613295210865, + "grad_norm": 1.2640881538391113, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0496, + "step": 7200 + }, + { + "epoch": 0.20614724803431023, + "grad_norm": 0.3773214817047119, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0455, + "step": 7210 + }, + { + "epoch": 0.20643316654753396, + "grad_norm": 0.6460191011428833, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0765, + "step": 7220 + }, + { + "epoch": 0.20671908506075767, + "grad_norm": 0.6048172116279602, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0506, + "step": 7230 + }, + { + "epoch": 0.2070050035739814, + "grad_norm": 0.38502392172813416, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0487, + "step": 7240 + }, + { + "epoch": 0.20729092208720515, + "grad_norm": 1.5727262496948242, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.20757684060042889, + "grad_norm": 0.3985368609428406, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0501, + "step": 7260 + }, + { + "epoch": 0.2078627591136526, + "grad_norm": 0.4519219994544983, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0542, + "step": 7270 + }, + { + "epoch": 0.20814867762687633, + "grad_norm": 0.6547327637672424, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0628, + "step": 7280 + }, + { + "epoch": 0.20843459614010007, + "grad_norm": 0.7864896655082703, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0521, + "step": 7290 + }, + { + "epoch": 0.2087205146533238, + "grad_norm": 0.6605416536331177, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0501, + "step": 7300 + }, + { + "epoch": 0.20900643316654754, + "grad_norm": 0.8260928988456726, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 0.20929235167977125, + "grad_norm": 0.7167025804519653, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0465, + "step": 7320 + }, + { + "epoch": 0.209578270192995, + "grad_norm": 0.6838316917419434, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0449, + "step": 7330 + }, + { + "epoch": 0.20986418870621873, + "grad_norm": 0.46520882844924927, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0441, + "step": 7340 + }, + { + "epoch": 0.21015010721944247, + "grad_norm": 0.680860698223114, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0498, + "step": 7350 + }, + { + "epoch": 0.21043602573266618, + "grad_norm": 0.6697542071342468, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0361, + "step": 7360 + }, + { + "epoch": 0.21072194424588991, + "grad_norm": 0.9322425127029419, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0561, + "step": 7370 + }, + { + "epoch": 0.21100786275911365, + "grad_norm": 0.7454982399940491, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0464, + "step": 7380 + }, + { + "epoch": 0.2112937812723374, + "grad_norm": 0.5052962899208069, + "learning_rate": 1.792902262617481e-05, + "loss": 0.042, + "step": 7390 + }, + { + "epoch": 0.21157969978556113, + "grad_norm": 0.622719407081604, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0392, + "step": 7400 + }, + { + "epoch": 0.21186561829878484, + "grad_norm": 0.8296751976013184, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0512, + "step": 7410 + }, + { + "epoch": 0.21215153681200857, + "grad_norm": 0.7341750860214233, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0407, + "step": 7420 + }, + { + "epoch": 0.2124374553252323, + "grad_norm": 0.8206498026847839, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0491, + "step": 7430 + }, + { + "epoch": 0.21272337383845605, + "grad_norm": 0.5625871419906616, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0405, + "step": 7440 + }, + { + "epoch": 0.21300929235167976, + "grad_norm": 0.600284218788147, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0464, + "step": 7450 + }, + { + "epoch": 0.2132952108649035, + "grad_norm": 1.0839911699295044, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0481, + "step": 7460 + }, + { + "epoch": 0.21358112937812723, + "grad_norm": 0.45663371682167053, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0478, + "step": 7470 + }, + { + "epoch": 0.21386704789135097, + "grad_norm": 0.9196961522102356, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0566, + "step": 7480 + }, + { + "epoch": 0.2141529664045747, + "grad_norm": 0.5013288855552673, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0442, + "step": 7490 + }, + { + "epoch": 0.21443888491779842, + "grad_norm": 0.6444706916809082, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0484, + "step": 7500 + }, + { + "epoch": 0.21472480343102215, + "grad_norm": 0.5789361000061035, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0536, + "step": 7510 + }, + { + "epoch": 0.2150107219442459, + "grad_norm": 0.7474827170372009, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0526, + "step": 7520 + }, + { + "epoch": 0.21529664045746963, + "grad_norm": 0.7054215669631958, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0538, + "step": 7530 + }, + { + "epoch": 0.21558255897069334, + "grad_norm": 0.9778858423233032, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0533, + "step": 7540 + }, + { + "epoch": 0.21586847748391708, + "grad_norm": 0.7189548015594482, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0479, + "step": 7550 + }, + { + "epoch": 0.2161543959971408, + "grad_norm": 0.8761522769927979, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0512, + "step": 7560 + }, + { + "epoch": 0.21644031451036455, + "grad_norm": 0.6686418652534485, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.06, + "step": 7570 + }, + { + "epoch": 0.2167262330235883, + "grad_norm": 0.6385156512260437, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0495, + "step": 7580 + }, + { + "epoch": 0.217012151536812, + "grad_norm": 0.4785522520542145, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0477, + "step": 7590 + }, + { + "epoch": 0.21729807005003574, + "grad_norm": 0.883179783821106, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0472, + "step": 7600 + }, + { + "epoch": 0.21758398856325947, + "grad_norm": 0.5431568026542664, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0383, + "step": 7610 + }, + { + "epoch": 0.2178699070764832, + "grad_norm": 0.7085764408111572, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0465, + "step": 7620 + }, + { + "epoch": 0.21815582558970692, + "grad_norm": 0.4877212643623352, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0517, + "step": 7630 + }, + { + "epoch": 0.21844174410293066, + "grad_norm": 0.6874392032623291, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0555, + "step": 7640 + }, + { + "epoch": 0.2187276626161544, + "grad_norm": 0.9611791372299194, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0415, + "step": 7650 + }, + { + "epoch": 0.21901358112937813, + "grad_norm": 0.3618314862251282, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0601, + "step": 7660 + }, + { + "epoch": 0.21929949964260187, + "grad_norm": 0.5366251468658447, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0383, + "step": 7670 + }, + { + "epoch": 0.21958541815582558, + "grad_norm": 0.6323129534721375, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0536, + "step": 7680 + }, + { + "epoch": 0.21987133666904932, + "grad_norm": 0.4621681571006775, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0442, + "step": 7690 + }, + { + "epoch": 0.22015725518227305, + "grad_norm": 0.9297679662704468, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0517, + "step": 7700 + }, + { + "epoch": 0.2204431736954968, + "grad_norm": 0.5950489640235901, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0468, + "step": 7710 + }, + { + "epoch": 0.2207290922087205, + "grad_norm": 0.30251142382621765, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0471, + "step": 7720 + }, + { + "epoch": 0.22101501072194424, + "grad_norm": 0.6247804760932922, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0488, + "step": 7730 + }, + { + "epoch": 0.22130092923516798, + "grad_norm": 0.7118366360664368, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0567, + "step": 7740 + }, + { + "epoch": 0.2215868477483917, + "grad_norm": 0.6265056133270264, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.06, + "step": 7750 + }, + { + "epoch": 0.22187276626161545, + "grad_norm": 0.7232056260108948, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0393, + "step": 7760 + }, + { + "epoch": 0.22215868477483916, + "grad_norm": 0.7981307506561279, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0518, + "step": 7770 + }, + { + "epoch": 0.2224446032880629, + "grad_norm": 0.4492819011211395, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0425, + "step": 7780 + }, + { + "epoch": 0.22273052180128664, + "grad_norm": 0.578440248966217, + "learning_rate": 1.767371389304538e-05, + "loss": 0.043, + "step": 7790 + }, + { + "epoch": 0.22301644031451037, + "grad_norm": 0.8093826174736023, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0571, + "step": 7800 + }, + { + "epoch": 0.22330235882773408, + "grad_norm": 0.864661455154419, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0429, + "step": 7810 + }, + { + "epoch": 0.22358827734095782, + "grad_norm": 0.50054532289505, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0404, + "step": 7820 + }, + { + "epoch": 0.22387419585418156, + "grad_norm": 0.5690511465072632, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0406, + "step": 7830 + }, + { + "epoch": 0.2241601143674053, + "grad_norm": 0.7075231671333313, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0478, + "step": 7840 + }, + { + "epoch": 0.22444603288062903, + "grad_norm": 0.6326742768287659, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.065, + "step": 7850 + }, + { + "epoch": 0.22473195139385274, + "grad_norm": 0.48305049538612366, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0456, + "step": 7860 + }, + { + "epoch": 0.22501786990707648, + "grad_norm": 0.6333707571029663, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.048, + "step": 7870 + }, + { + "epoch": 0.22530378842030022, + "grad_norm": 0.6568662524223328, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0422, + "step": 7880 + }, + { + "epoch": 0.22558970693352395, + "grad_norm": 0.6302695870399475, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0589, + "step": 7890 + }, + { + "epoch": 0.22587562544674766, + "grad_norm": 0.6373940110206604, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0504, + "step": 7900 + }, + { + "epoch": 0.2261615439599714, + "grad_norm": 0.7108445167541504, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0486, + "step": 7910 + }, + { + "epoch": 0.22644746247319514, + "grad_norm": 0.5274208784103394, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0693, + "step": 7920 + }, + { + "epoch": 0.22673338098641888, + "grad_norm": 0.4020678997039795, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0461, + "step": 7930 + }, + { + "epoch": 0.2270192994996426, + "grad_norm": 0.5584745407104492, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0376, + "step": 7940 + }, + { + "epoch": 0.22730521801286632, + "grad_norm": 0.6614044904708862, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0461, + "step": 7950 + }, + { + "epoch": 0.22759113652609006, + "grad_norm": 0.506636917591095, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0431, + "step": 7960 + }, + { + "epoch": 0.2278770550393138, + "grad_norm": 0.5168156027793884, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0404, + "step": 7970 + }, + { + "epoch": 0.22816297355253753, + "grad_norm": 0.552480161190033, + "learning_rate": 1.754802282200567e-05, + "loss": 0.0565, + "step": 7980 + }, + { + "epoch": 0.22844889206576124, + "grad_norm": 0.8191191554069519, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0556, + "step": 7990 + }, + { + "epoch": 0.22873481057898498, + "grad_norm": 0.7767695188522339, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0447, + "step": 8000 + }, + { + "epoch": 0.22902072909220872, + "grad_norm": 0.9050281047821045, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0611, + "step": 8010 + }, + { + "epoch": 0.22930664760543246, + "grad_norm": 0.7805314660072327, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0532, + "step": 8020 + }, + { + "epoch": 0.2295925661186562, + "grad_norm": 0.6055987477302551, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0436, + "step": 8030 + }, + { + "epoch": 0.2298784846318799, + "grad_norm": 1.1075741052627563, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.053, + "step": 8040 + }, + { + "epoch": 0.23016440314510364, + "grad_norm": 0.6283855438232422, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0494, + "step": 8050 + }, + { + "epoch": 0.23045032165832738, + "grad_norm": 0.44009697437286377, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.047, + "step": 8060 + }, + { + "epoch": 0.23073624017155112, + "grad_norm": 0.4920162856578827, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0437, + "step": 8070 + }, + { + "epoch": 0.23102215868477483, + "grad_norm": 0.9286724328994751, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0513, + "step": 8080 + }, + { + "epoch": 0.23130807719799856, + "grad_norm": 0.6595107913017273, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0465, + "step": 8090 + }, + { + "epoch": 0.2315939957112223, + "grad_norm": 0.4930933713912964, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0422, + "step": 8100 + }, + { + "epoch": 0.23187991422444604, + "grad_norm": 0.6741859316825867, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0419, + "step": 8110 + }, + { + "epoch": 0.23216583273766978, + "grad_norm": 0.8081800937652588, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0449, + "step": 8120 + }, + { + "epoch": 0.23245175125089348, + "grad_norm": 1.0258036851882935, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0613, + "step": 8130 + }, + { + "epoch": 0.23273766976411722, + "grad_norm": 0.5007345080375671, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0473, + "step": 8140 + }, + { + "epoch": 0.23302358827734096, + "grad_norm": 0.3931804895401001, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0495, + "step": 8150 + }, + { + "epoch": 0.2333095067905647, + "grad_norm": 0.5907166600227356, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0449, + "step": 8160 + }, + { + "epoch": 0.2335954253037884, + "grad_norm": 0.49229851365089417, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0524, + "step": 8170 + }, + { + "epoch": 0.23388134381701214, + "grad_norm": 0.8386240601539612, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0527, + "step": 8180 + }, + { + "epoch": 0.23416726233023588, + "grad_norm": 0.7806615829467773, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0529, + "step": 8190 + }, + { + "epoch": 0.23445318084345962, + "grad_norm": 0.5716270804405212, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0534, + "step": 8200 + }, + { + "epoch": 0.23473909935668336, + "grad_norm": 1.165761947631836, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0591, + "step": 8210 + }, + { + "epoch": 0.23502501786990707, + "grad_norm": 0.867967426776886, + "learning_rate": 1.738529690353544e-05, + "loss": 0.049, + "step": 8220 + }, + { + "epoch": 0.2353109363831308, + "grad_norm": 0.5809492468833923, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0434, + "step": 8230 + }, + { + "epoch": 0.23559685489635454, + "grad_norm": 0.8418740034103394, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0461, + "step": 8240 + }, + { + "epoch": 0.23588277340957828, + "grad_norm": 0.5811617374420166, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0443, + "step": 8250 + }, + { + "epoch": 0.236168691922802, + "grad_norm": 0.7699318528175354, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0549, + "step": 8260 + }, + { + "epoch": 0.23645461043602573, + "grad_norm": 0.6066992878913879, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0415, + "step": 8270 + }, + { + "epoch": 0.23674052894924946, + "grad_norm": 0.7775973677635193, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0619, + "step": 8280 + }, + { + "epoch": 0.2370264474624732, + "grad_norm": 0.8320962190628052, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.048, + "step": 8290 + }, + { + "epoch": 0.23731236597569694, + "grad_norm": 0.7203818559646606, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0594, + "step": 8300 + }, + { + "epoch": 0.23759828448892065, + "grad_norm": 0.7634598612785339, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0614, + "step": 8310 + }, + { + "epoch": 0.23788420300214438, + "grad_norm": 0.557575523853302, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 0.23817012151536812, + "grad_norm": 1.0139968395233154, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0457, + "step": 8330 + }, + { + "epoch": 0.23845604002859186, + "grad_norm": 0.5543113946914673, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.048, + "step": 8340 + }, + { + "epoch": 0.23874195854181557, + "grad_norm": 1.0122590065002441, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0509, + "step": 8350 + }, + { + "epoch": 0.2390278770550393, + "grad_norm": 0.8776134252548218, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0475, + "step": 8360 + }, + { + "epoch": 0.23931379556826304, + "grad_norm": 0.41230106353759766, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0467, + "step": 8370 + }, + { + "epoch": 0.23959971408148678, + "grad_norm": 0.5460986495018005, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0455, + "step": 8380 + }, + { + "epoch": 0.23988563259471052, + "grad_norm": 0.5896333456039429, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.051, + "step": 8390 + }, + { + "epoch": 0.24017155110793423, + "grad_norm": 0.536375105381012, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0432, + "step": 8400 + }, + { + "epoch": 0.24045746962115797, + "grad_norm": 0.7597050666809082, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0459, + "step": 8410 + }, + { + "epoch": 0.2407433881343817, + "grad_norm": 0.6669795513153076, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0584, + "step": 8420 + }, + { + "epoch": 0.24102930664760544, + "grad_norm": 0.3614502251148224, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.039, + "step": 8430 + }, + { + "epoch": 0.24131522516082915, + "grad_norm": 0.5618023872375488, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0394, + "step": 8440 + }, + { + "epoch": 0.2416011436740529, + "grad_norm": 0.5897185802459717, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0502, + "step": 8450 + }, + { + "epoch": 0.24188706218727662, + "grad_norm": 0.5622876882553101, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0382, + "step": 8460 + }, + { + "epoch": 0.24217298070050036, + "grad_norm": 0.5639696717262268, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0652, + "step": 8470 + }, + { + "epoch": 0.2424588992137241, + "grad_norm": 0.5686836242675781, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0609, + "step": 8480 + }, + { + "epoch": 0.2427448177269478, + "grad_norm": 0.7248222827911377, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0512, + "step": 8490 + }, + { + "epoch": 0.24303073624017155, + "grad_norm": 0.6157225370407104, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0449, + "step": 8500 + }, + { + "epoch": 0.24331665475339528, + "grad_norm": 1.1660966873168945, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0427, + "step": 8510 + }, + { + "epoch": 0.24360257326661902, + "grad_norm": 1.1242589950561523, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0514, + "step": 8520 + }, + { + "epoch": 0.24388849177984273, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0491, + "step": 8530 + }, + { + "epoch": 0.24417441029306647, + "grad_norm": 0.41474589705467224, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0427, + "step": 8540 + }, + { + "epoch": 0.2444603288062902, + "grad_norm": 0.42195969820022583, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0486, + "step": 8550 + }, + { + "epoch": 0.24474624731951394, + "grad_norm": 0.3914433717727661, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0411, + "step": 8560 + }, + { + "epoch": 0.24503216583273768, + "grad_norm": 0.7590876817703247, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0439, + "step": 8570 + }, + { + "epoch": 0.2453180843459614, + "grad_norm": 0.4362296164035797, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0466, + "step": 8580 + }, + { + "epoch": 0.24560400285918513, + "grad_norm": 0.467949241399765, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0502, + "step": 8590 + }, + { + "epoch": 0.24588992137240887, + "grad_norm": 0.4731729328632355, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0599, + "step": 8600 + }, + { + "epoch": 0.2461758398856326, + "grad_norm": 0.491644948720932, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0524, + "step": 8610 + }, + { + "epoch": 0.2464617583988563, + "grad_norm": 0.5254928469657898, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0469, + "step": 8620 + }, + { + "epoch": 0.24674767691208005, + "grad_norm": 0.5721238255500793, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0493, + "step": 8630 + }, + { + "epoch": 0.2470335954253038, + "grad_norm": 0.5806096792221069, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0391, + "step": 8640 + }, + { + "epoch": 0.24731951393852752, + "grad_norm": 0.6683222055435181, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.24760543245175126, + "grad_norm": 0.41728726029396057, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0411, + "step": 8660 + }, + { + "epoch": 0.24789135096497497, + "grad_norm": 0.6001113653182983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0413, + "step": 8670 + }, + { + "epoch": 0.2481772694781987, + "grad_norm": 0.43813610076904297, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0389, + "step": 8680 + }, + { + "epoch": 0.24846318799142245, + "grad_norm": 1.5533791780471802, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0597, + "step": 8690 + }, + { + "epoch": 0.24874910650464618, + "grad_norm": 1.175837755203247, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 0.2490350250178699, + "grad_norm": 0.4798300862312317, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0459, + "step": 8710 + }, + { + "epoch": 0.24932094353109363, + "grad_norm": 0.7334772944450378, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0432, + "step": 8720 + }, + { + "epoch": 0.24960686204431737, + "grad_norm": 0.9633310437202454, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.05, + "step": 8730 + }, + { + "epoch": 0.2498927805575411, + "grad_norm": 0.7353480458259583, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.25017869907076484, + "grad_norm": 0.5958748459815979, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0428, + "step": 8750 + }, + { + "epoch": 0.2504646175839886, + "grad_norm": 0.8538689613342285, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0498, + "step": 8760 + }, + { + "epoch": 0.2507505360972123, + "grad_norm": 0.606607973575592, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.251036454610436, + "grad_norm": 0.3999035060405731, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0714, + "step": 8780 + }, + { + "epoch": 0.25132237312365974, + "grad_norm": 0.807314932346344, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.057, + "step": 8790 + }, + { + "epoch": 0.2516082916368835, + "grad_norm": 0.5238217115402222, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0485, + "step": 8800 + }, + { + "epoch": 0.2518942101501072, + "grad_norm": 1.6465950012207031, + "learning_rate": 1.696714953556411e-05, + "loss": 0.056, + "step": 8810 + }, + { + "epoch": 0.25218012866333095, + "grad_norm": 0.6568214297294617, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0424, + "step": 8820 + }, + { + "epoch": 0.2524660471765547, + "grad_norm": 0.4695168137550354, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0456, + "step": 8830 + }, + { + "epoch": 0.2527519656897784, + "grad_norm": 0.5652263164520264, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0527, + "step": 8840 + }, + { + "epoch": 0.25303788420300216, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0441, + "step": 8850 + }, + { + "epoch": 0.2533238027162259, + "grad_norm": 0.8288971781730652, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0513, + "step": 8860 + }, + { + "epoch": 0.2536097212294496, + "grad_norm": 0.8606051802635193, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0416, + "step": 8870 + }, + { + "epoch": 0.2538956397426733, + "grad_norm": 0.7235842347145081, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0481, + "step": 8880 + }, + { + "epoch": 0.25418155825589706, + "grad_norm": 0.9602673053741455, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0465, + "step": 8890 + }, + { + "epoch": 0.2544674767691208, + "grad_norm": 0.6431217789649963, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0489, + "step": 8900 + }, + { + "epoch": 0.25475339528234453, + "grad_norm": 0.42215701937675476, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0376, + "step": 8910 + }, + { + "epoch": 0.25503931379556827, + "grad_norm": 0.5899976491928101, + "learning_rate": 1.688644181174108e-05, + "loss": 0.048, + "step": 8920 + }, + { + "epoch": 0.255325232308792, + "grad_norm": 0.9504411816596985, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.047, + "step": 8930 + }, + { + "epoch": 0.25561115082201574, + "grad_norm": 0.5808438062667847, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0535, + "step": 8940 + }, + { + "epoch": 0.2558970693352395, + "grad_norm": 0.3811270594596863, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0418, + "step": 8950 + }, + { + "epoch": 0.25618298784846316, + "grad_norm": 1.0257363319396973, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0548, + "step": 8960 + }, + { + "epoch": 0.2564689063616869, + "grad_norm": 0.7294469475746155, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0569, + "step": 8970 + }, + { + "epoch": 0.25675482487491064, + "grad_norm": 0.4967000484466553, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0488, + "step": 8980 + }, + { + "epoch": 0.2570407433881344, + "grad_norm": 0.9160422086715698, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0471, + "step": 8990 + }, + { + "epoch": 0.2573266619013581, + "grad_norm": 0.5125435590744019, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.25761258041458185, + "grad_norm": 0.5617201328277588, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0597, + "step": 9010 + }, + { + "epoch": 0.2578984989278056, + "grad_norm": 0.7771851420402527, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0485, + "step": 9020 + }, + { + "epoch": 0.2581844174410293, + "grad_norm": 0.8434289693832397, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0429, + "step": 9030 + }, + { + "epoch": 0.25847033595425306, + "grad_norm": 0.513541042804718, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0488, + "step": 9040 + }, + { + "epoch": 0.25875625446747674, + "grad_norm": 1.0142096281051636, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0517, + "step": 9050 + }, + { + "epoch": 0.2590421729807005, + "grad_norm": 0.6343669295310974, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.049, + "step": 9060 + }, + { + "epoch": 0.2593280914939242, + "grad_norm": 0.33996936678886414, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.037, + "step": 9070 + }, + { + "epoch": 0.25961401000714796, + "grad_norm": 0.5964446663856506, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 0.2598999285203717, + "grad_norm": 0.4989728629589081, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0463, + "step": 9090 + }, + { + "epoch": 0.26018584703359543, + "grad_norm": 0.7735986113548279, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0576, + "step": 9100 + }, + { + "epoch": 0.26047176554681917, + "grad_norm": 1.2520418167114258, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0577, + "step": 9110 + }, + { + "epoch": 0.2607576840600429, + "grad_norm": 0.45247936248779297, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0458, + "step": 9120 + }, + { + "epoch": 0.26104360257326664, + "grad_norm": 0.8944823145866394, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0498, + "step": 9130 + }, + { + "epoch": 0.2613295210864903, + "grad_norm": 0.8308315277099609, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0545, + "step": 9140 + }, + { + "epoch": 0.26161543959971406, + "grad_norm": 0.6838778853416443, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 0.2619013581129378, + "grad_norm": 1.5998408794403076, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0591, + "step": 9160 + }, + { + "epoch": 0.26218727662616154, + "grad_norm": 0.8548596501350403, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.04, + "step": 9170 + }, + { + "epoch": 0.2624731951393853, + "grad_norm": 0.5784913897514343, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0464, + "step": 9180 + }, + { + "epoch": 0.262759113652609, + "grad_norm": 1.490502953529358, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0672, + "step": 9190 + }, + { + "epoch": 0.26304503216583275, + "grad_norm": 0.8950793743133545, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0532, + "step": 9200 + }, + { + "epoch": 0.2633309506790565, + "grad_norm": 0.5513611435890198, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 0.2636168691922802, + "grad_norm": 1.0512864589691162, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0487, + "step": 9220 + }, + { + "epoch": 0.2639027877055039, + "grad_norm": 0.48180028796195984, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0543, + "step": 9230 + }, + { + "epoch": 0.26418870621872764, + "grad_norm": 0.5451590418815613, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0553, + "step": 9240 + }, + { + "epoch": 0.2644746247319514, + "grad_norm": 0.6986148953437805, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0523, + "step": 9250 + }, + { + "epoch": 0.2647605432451751, + "grad_norm": 0.5977929830551147, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0446, + "step": 9260 + }, + { + "epoch": 0.26504646175839885, + "grad_norm": 0.6042361855506897, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0716, + "step": 9270 + }, + { + "epoch": 0.2653323802716226, + "grad_norm": 0.473418265581131, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0378, + "step": 9280 + }, + { + "epoch": 0.26561829878484633, + "grad_norm": 0.9332809448242188, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0484, + "step": 9290 + }, + { + "epoch": 0.26590421729807007, + "grad_norm": 0.5209246277809143, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0439, + "step": 9300 + }, + { + "epoch": 0.2661901358112938, + "grad_norm": 0.5742560625076294, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0468, + "step": 9310 + }, + { + "epoch": 0.2664760543245175, + "grad_norm": 0.585503876209259, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0507, + "step": 9320 + }, + { + "epoch": 0.2667619728377412, + "grad_norm": 0.5254957675933838, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0436, + "step": 9330 + }, + { + "epoch": 0.26704789135096496, + "grad_norm": 0.48314452171325684, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0417, + "step": 9340 + }, + { + "epoch": 0.2673338098641887, + "grad_norm": 0.630020022392273, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0425, + "step": 9350 + }, + { + "epoch": 0.26761972837741244, + "grad_norm": 0.3545299470424652, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0338, + "step": 9360 + }, + { + "epoch": 0.2679056468906362, + "grad_norm": 0.6934211850166321, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0445, + "step": 9370 + }, + { + "epoch": 0.2681915654038599, + "grad_norm": 0.6544952392578125, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0451, + "step": 9380 + }, + { + "epoch": 0.26847748391708365, + "grad_norm": 0.4581946134567261, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0422, + "step": 9390 + }, + { + "epoch": 0.2687634024303074, + "grad_norm": 0.6338506937026978, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0576, + "step": 9400 + }, + { + "epoch": 0.26904932094353107, + "grad_norm": 0.8165014386177063, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0474, + "step": 9410 + }, + { + "epoch": 0.2693352394567548, + "grad_norm": 0.793222188949585, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0546, + "step": 9420 + }, + { + "epoch": 0.26962115796997854, + "grad_norm": 0.3669852316379547, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0461, + "step": 9430 + }, + { + "epoch": 0.2699070764832023, + "grad_norm": 0.7339810729026794, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0433, + "step": 9440 + }, + { + "epoch": 0.270192994996426, + "grad_norm": 0.4948982298374176, + "learning_rate": 1.648606940465527e-05, + "loss": 0.048, + "step": 9450 + }, + { + "epoch": 0.27047891350964975, + "grad_norm": 0.4681016206741333, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0437, + "step": 9460 + }, + { + "epoch": 0.2707648320228735, + "grad_norm": 0.5091472864151001, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0576, + "step": 9470 + }, + { + "epoch": 0.27105075053609723, + "grad_norm": 0.5683515071868896, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0503, + "step": 9480 + }, + { + "epoch": 0.27133666904932097, + "grad_norm": 0.626844048500061, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0495, + "step": 9490 + }, + { + "epoch": 0.27162258756254465, + "grad_norm": 0.6757943034172058, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0495, + "step": 9500 + }, + { + "epoch": 0.2719085060757684, + "grad_norm": 0.7049196362495422, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0579, + "step": 9510 + }, + { + "epoch": 0.2721944245889921, + "grad_norm": 0.6469181776046753, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.051, + "step": 9520 + }, + { + "epoch": 0.27248034310221586, + "grad_norm": 0.5414942502975464, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0433, + "step": 9530 + }, + { + "epoch": 0.2727662616154396, + "grad_norm": 0.5642798542976379, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0495, + "step": 9540 + }, + { + "epoch": 0.27305218012866334, + "grad_norm": 1.0527595281600952, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0445, + "step": 9550 + }, + { + "epoch": 0.2733380986418871, + "grad_norm": 0.8501784801483154, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0627, + "step": 9560 + }, + { + "epoch": 0.2736240171551108, + "grad_norm": 0.7892033457756042, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 0.27390993566833455, + "grad_norm": 0.3588624596595764, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0512, + "step": 9580 + }, + { + "epoch": 0.27419585418155823, + "grad_norm": 0.7474772930145264, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.6217718124389648, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0435, + "step": 9600 + }, + { + "epoch": 0.2747676912080057, + "grad_norm": 0.7711623907089233, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.054, + "step": 9610 + }, + { + "epoch": 0.27505360972122944, + "grad_norm": 0.8171371221542358, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0371, + "step": 9620 + }, + { + "epoch": 0.2753395282344532, + "grad_norm": 0.8668338060379028, + "learning_rate": 1.634591312387623e-05, + "loss": 0.055, + "step": 9630 + }, + { + "epoch": 0.2756254467476769, + "grad_norm": 0.5683940052986145, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0478, + "step": 9640 + }, + { + "epoch": 0.27591136526090065, + "grad_norm": 0.44098007678985596, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0531, + "step": 9650 + }, + { + "epoch": 0.2761972837741244, + "grad_norm": 0.8305087685585022, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0462, + "step": 9660 + }, + { + "epoch": 0.27648320228734813, + "grad_norm": 0.9088799953460693, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0489, + "step": 9670 + }, + { + "epoch": 0.2767691208005718, + "grad_norm": 0.5590132474899292, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0486, + "step": 9680 + }, + { + "epoch": 0.27705503931379555, + "grad_norm": 0.776713490486145, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0443, + "step": 9690 + }, + { + "epoch": 0.2773409578270193, + "grad_norm": 0.6107578873634338, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0461, + "step": 9700 + }, + { + "epoch": 0.277626876340243, + "grad_norm": 0.4635901153087616, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0397, + "step": 9710 + }, + { + "epoch": 0.27791279485346676, + "grad_norm": 0.4220955967903137, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0463, + "step": 9720 + }, + { + "epoch": 0.2781987133666905, + "grad_norm": 0.4947739243507385, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0397, + "step": 9730 + }, + { + "epoch": 0.27848463187991424, + "grad_norm": 0.5589033961296082, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0426, + "step": 9740 + }, + { + "epoch": 0.278770550393138, + "grad_norm": 0.4904254972934723, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0458, + "step": 9750 + }, + { + "epoch": 0.2790564689063617, + "grad_norm": 0.34956127405166626, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.2793423874195854, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0424, + "step": 9770 + }, + { + "epoch": 0.27962830593280913, + "grad_norm": 0.48727869987487793, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0451, + "step": 9780 + }, + { + "epoch": 0.27991422444603287, + "grad_norm": 0.7314761281013489, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0523, + "step": 9790 + }, + { + "epoch": 0.2802001429592566, + "grad_norm": 0.5017405152320862, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0423, + "step": 9800 + }, + { + "epoch": 0.28048606147248034, + "grad_norm": 0.8375383615493774, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0435, + "step": 9810 + }, + { + "epoch": 0.2807719799857041, + "grad_norm": 0.8702818155288696, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0487, + "step": 9820 + }, + { + "epoch": 0.2810578984989278, + "grad_norm": 0.4649866223335266, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0483, + "step": 9830 + }, + { + "epoch": 0.28134381701215155, + "grad_norm": 0.7464607357978821, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0463, + "step": 9840 + }, + { + "epoch": 0.2816297355253753, + "grad_norm": 0.48055607080459595, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0418, + "step": 9850 + }, + { + "epoch": 0.281915654038599, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0433, + "step": 9860 + }, + { + "epoch": 0.2822015725518227, + "grad_norm": 0.8859265446662903, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0605, + "step": 9870 + }, + { + "epoch": 0.28248749106504645, + "grad_norm": 0.8236640691757202, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0441, + "step": 9880 + }, + { + "epoch": 0.2827734095782702, + "grad_norm": 0.6617199778556824, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0515, + "step": 9890 + }, + { + "epoch": 0.2830593280914939, + "grad_norm": 0.8017821907997131, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0503, + "step": 9900 + }, + { + "epoch": 0.28334524660471766, + "grad_norm": 1.070827603340149, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0485, + "step": 9910 + }, + { + "epoch": 0.2836311651179414, + "grad_norm": 1.021888256072998, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0479, + "step": 9920 + }, + { + "epoch": 0.28391708363116513, + "grad_norm": 0.34402501583099365, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0391, + "step": 9930 + }, + { + "epoch": 0.28420300214438887, + "grad_norm": 0.58541339635849, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0461, + "step": 9940 + }, + { + "epoch": 0.28448892065761255, + "grad_norm": 0.8062207102775574, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0553, + "step": 9950 + }, + { + "epoch": 0.2847748391708363, + "grad_norm": 0.6435661315917969, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0536, + "step": 9960 + }, + { + "epoch": 0.28506075768406003, + "grad_norm": 0.5670832395553589, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0405, + "step": 9970 + }, + { + "epoch": 0.28534667619728377, + "grad_norm": 0.45282548666000366, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0458, + "step": 9980 + }, + { + "epoch": 0.2856325947105075, + "grad_norm": 0.42272916436195374, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0392, + "step": 9990 + }, + { + "epoch": 0.28591851322373124, + "grad_norm": 0.5791928768157959, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0453, + "step": 10000 + }, + { + "epoch": 0.286204431736955, + "grad_norm": 0.9841408729553223, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.052, + "step": 10010 + }, + { + "epoch": 0.2864903502501787, + "grad_norm": 0.8658338785171509, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0461, + "step": 10020 + }, + { + "epoch": 0.28677626876340245, + "grad_norm": 0.624788224697113, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0416, + "step": 10030 + }, + { + "epoch": 0.28706218727662614, + "grad_norm": 0.6108028888702393, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0548, + "step": 10040 + }, + { + "epoch": 0.2873481057898499, + "grad_norm": 0.7907708883285522, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0406, + "step": 10050 + }, + { + "epoch": 0.2876340243030736, + "grad_norm": 0.7695413827896118, + "learning_rate": 1.60029690609047e-05, + "loss": 0.061, + "step": 10060 + }, + { + "epoch": 0.28791994281629735, + "grad_norm": 0.4407683312892914, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0483, + "step": 10070 + }, + { + "epoch": 0.2882058613295211, + "grad_norm": 0.6242743730545044, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.039, + "step": 10080 + }, + { + "epoch": 0.2884917798427448, + "grad_norm": 0.8752113580703735, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0433, + "step": 10090 + }, + { + "epoch": 0.28877769835596856, + "grad_norm": 0.8834511041641235, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0348, + "step": 10100 + }, + { + "epoch": 0.2890636168691923, + "grad_norm": 1.0036063194274902, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0593, + "step": 10110 + }, + { + "epoch": 0.28934953538241603, + "grad_norm": 0.5511205196380615, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0459, + "step": 10120 + }, + { + "epoch": 0.2896354538956397, + "grad_norm": 0.7717337012290955, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0381, + "step": 10130 + }, + { + "epoch": 0.28992137240886345, + "grad_norm": 1.123363971710205, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0462, + "step": 10140 + }, + { + "epoch": 0.2902072909220872, + "grad_norm": 0.6212007403373718, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0446, + "step": 10150 + }, + { + "epoch": 0.29049320943531093, + "grad_norm": 0.5547964572906494, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0362, + "step": 10160 + }, + { + "epoch": 0.29077912794853467, + "grad_norm": 0.593225359916687, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0462, + "step": 10170 + }, + { + "epoch": 0.2910650464617584, + "grad_norm": 0.5569560527801514, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0508, + "step": 10180 + }, + { + "epoch": 0.29135096497498214, + "grad_norm": 0.5464656949043274, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0399, + "step": 10190 + }, + { + "epoch": 0.2916368834882059, + "grad_norm": 1.2456778287887573, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0494, + "step": 10200 + }, + { + "epoch": 0.2919228020014296, + "grad_norm": 0.7862445712089539, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0551, + "step": 10210 + }, + { + "epoch": 0.2922087205146533, + "grad_norm": 0.745941698551178, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0469, + "step": 10220 + }, + { + "epoch": 0.29249463902787703, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0492, + "step": 10230 + }, + { + "epoch": 0.29278055754110077, + "grad_norm": 0.659205973148346, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0453, + "step": 10240 + }, + { + "epoch": 0.2930664760543245, + "grad_norm": 0.6925905346870422, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0463, + "step": 10250 + }, + { + "epoch": 0.29335239456754825, + "grad_norm": 0.479115754365921, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0395, + "step": 10260 + }, + { + "epoch": 0.293638313080772, + "grad_norm": 0.5085121393203735, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0504, + "step": 10270 + }, + { + "epoch": 0.2939242315939957, + "grad_norm": 0.46833914518356323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0411, + "step": 10280 + }, + { + "epoch": 0.29421015010721946, + "grad_norm": 0.4534672796726227, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0491, + "step": 10290 + }, + { + "epoch": 0.2944960686204432, + "grad_norm": 0.5704737305641174, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0391, + "step": 10300 + }, + { + "epoch": 0.2947819871336669, + "grad_norm": 1.0342676639556885, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0681, + "step": 10310 + }, + { + "epoch": 0.2950679056468906, + "grad_norm": 0.5002169013023376, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0429, + "step": 10320 + }, + { + "epoch": 0.29535382416011435, + "grad_norm": 0.5565863847732544, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0575, + "step": 10330 + }, + { + "epoch": 0.2956397426733381, + "grad_norm": 0.7826551198959351, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0448, + "step": 10340 + }, + { + "epoch": 0.29592566118656183, + "grad_norm": 0.7019012570381165, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0436, + "step": 10350 + }, + { + "epoch": 0.29621157969978557, + "grad_norm": 0.8324534893035889, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0503, + "step": 10360 + }, + { + "epoch": 0.2964974982130093, + "grad_norm": 0.7064073085784912, + "learning_rate": 1.574895332125391e-05, + "loss": 0.041, + "step": 10370 + }, + { + "epoch": 0.29678341672623304, + "grad_norm": 0.5634047389030457, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0474, + "step": 10380 + }, + { + "epoch": 0.2970693352394568, + "grad_norm": 0.8504926562309265, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0502, + "step": 10390 + }, + { + "epoch": 0.29735525375268046, + "grad_norm": 0.508313775062561, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0368, + "step": 10400 + }, + { + "epoch": 0.2976411722659042, + "grad_norm": 0.5851112008094788, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0472, + "step": 10410 + }, + { + "epoch": 0.29792709077912793, + "grad_norm": 0.5689557790756226, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0479, + "step": 10420 + }, + { + "epoch": 0.29821300929235167, + "grad_norm": 0.5026743412017822, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0406, + "step": 10430 + }, + { + "epoch": 0.2984989278055754, + "grad_norm": 0.5662751197814941, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0441, + "step": 10440 + }, + { + "epoch": 0.29878484631879915, + "grad_norm": 0.899709939956665, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.2990707648320229, + "grad_norm": 0.4681940972805023, + "learning_rate": 1.567419089313346e-05, + "loss": 0.054, + "step": 10460 + }, + { + "epoch": 0.2993566833452466, + "grad_norm": 0.39646071195602417, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0375, + "step": 10470 + }, + { + "epoch": 0.29964260185847036, + "grad_norm": 1.204815149307251, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0487, + "step": 10480 + }, + { + "epoch": 0.29992852037169404, + "grad_norm": 0.4507630467414856, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0516, + "step": 10490 + }, + { + "epoch": 0.3002144388849178, + "grad_norm": 0.9783321022987366, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0642, + "step": 10500 + }, + { + "epoch": 0.3005003573981415, + "grad_norm": 0.5406969785690308, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0447, + "step": 10510 + }, + { + "epoch": 0.30078627591136525, + "grad_norm": 0.44153860211372375, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0449, + "step": 10520 + }, + { + "epoch": 0.301072194424589, + "grad_norm": 0.5723687410354614, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0548, + "step": 10530 + }, + { + "epoch": 0.3013581129378127, + "grad_norm": 0.4453120529651642, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0434, + "step": 10540 + }, + { + "epoch": 0.30164403145103647, + "grad_norm": 0.34224697947502136, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0385, + "step": 10550 + }, + { + "epoch": 0.3019299499642602, + "grad_norm": 0.6389157176017761, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0569, + "step": 10560 + }, + { + "epoch": 0.30221586847748394, + "grad_norm": 0.5845953822135925, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0467, + "step": 10570 + }, + { + "epoch": 0.3025017869907076, + "grad_norm": 0.6581900119781494, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0422, + "step": 10580 + }, + { + "epoch": 0.30278770550393136, + "grad_norm": 0.4964161813259125, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0428, + "step": 10590 + }, + { + "epoch": 0.3030736240171551, + "grad_norm": 0.635380208492279, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0442, + "step": 10600 + }, + { + "epoch": 0.30335954253037883, + "grad_norm": 0.9795969128608704, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0517, + "step": 10610 + }, + { + "epoch": 0.30364546104360257, + "grad_norm": 0.9987231492996216, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0514, + "step": 10620 + }, + { + "epoch": 0.3039313795568263, + "grad_norm": 0.6384946703910828, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0471, + "step": 10630 + }, + { + "epoch": 0.30421729807005005, + "grad_norm": 0.49352115392684937, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0351, + "step": 10640 + }, + { + "epoch": 0.3045032165832738, + "grad_norm": 0.45028480887413025, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0438, + "step": 10650 + }, + { + "epoch": 0.3047891350964975, + "grad_norm": 0.5717794895172119, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0491, + "step": 10660 + }, + { + "epoch": 0.3050750536097212, + "grad_norm": 0.5436326265335083, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0407, + "step": 10670 + }, + { + "epoch": 0.30536097212294494, + "grad_norm": 0.7777692675590515, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0436, + "step": 10680 + }, + { + "epoch": 0.3056468906361687, + "grad_norm": 0.6597929000854492, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0407, + "step": 10690 + }, + { + "epoch": 0.3059328091493924, + "grad_norm": 0.6059311032295227, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0481, + "step": 10700 + }, + { + "epoch": 0.30621872766261615, + "grad_norm": 0.5530681014060974, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0418, + "step": 10710 + }, + { + "epoch": 0.3065046461758399, + "grad_norm": 0.5778716802597046, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0429, + "step": 10720 + }, + { + "epoch": 0.3067905646890636, + "grad_norm": 0.4573792517185211, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0586, + "step": 10730 + }, + { + "epoch": 0.30707648320228736, + "grad_norm": 0.8193615078926086, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0474, + "step": 10740 + }, + { + "epoch": 0.3073624017155111, + "grad_norm": 0.9410123229026794, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.3076483202287348, + "grad_norm": 0.8244432806968689, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0462, + "step": 10760 + }, + { + "epoch": 0.3079342387419585, + "grad_norm": 0.644899845123291, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0479, + "step": 10770 + }, + { + "epoch": 0.30822015725518226, + "grad_norm": 0.28044867515563965, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.04, + "step": 10780 + }, + { + "epoch": 0.308506075768406, + "grad_norm": 0.6538394093513489, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0406, + "step": 10790 + }, + { + "epoch": 0.30879199428162973, + "grad_norm": 0.9572822451591492, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0505, + "step": 10800 + }, + { + "epoch": 0.30907791279485347, + "grad_norm": 0.539826512336731, + "learning_rate": 1.537928999540189e-05, + "loss": 0.05, + "step": 10810 + }, + { + "epoch": 0.3093638313080772, + "grad_norm": 0.801988959312439, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0454, + "step": 10820 + }, + { + "epoch": 0.30964974982130095, + "grad_norm": 0.57478928565979, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.039, + "step": 10830 + }, + { + "epoch": 0.3099356683345247, + "grad_norm": 0.6313017010688782, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0384, + "step": 10840 + }, + { + "epoch": 0.31022158684774837, + "grad_norm": 0.507997989654541, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0365, + "step": 10850 + }, + { + "epoch": 0.3105075053609721, + "grad_norm": 0.5152313709259033, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0487, + "step": 10860 + }, + { + "epoch": 0.31079342387419584, + "grad_norm": 0.6123478412628174, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0405, + "step": 10870 + }, + { + "epoch": 0.3110793423874196, + "grad_norm": 1.079551100730896, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0443, + "step": 10880 + }, + { + "epoch": 0.3113652609006433, + "grad_norm": 0.39866960048675537, + "learning_rate": 1.531098472380285e-05, + "loss": 0.04, + "step": 10890 + }, + { + "epoch": 0.31165117941386705, + "grad_norm": 0.3715427815914154, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0387, + "step": 10900 + }, + { + "epoch": 0.3119370979270908, + "grad_norm": 0.7201068997383118, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.054, + "step": 10910 + }, + { + "epoch": 0.3122230164403145, + "grad_norm": 0.9512631893157959, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0383, + "step": 10920 + }, + { + "epoch": 0.31250893495353826, + "grad_norm": 0.5948206186294556, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0472, + "step": 10930 + }, + { + "epoch": 0.31279485346676195, + "grad_norm": 0.7174249291419983, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0437, + "step": 10940 + }, + { + "epoch": 0.3130807719799857, + "grad_norm": 0.6190982460975647, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0383, + "step": 10950 + }, + { + "epoch": 0.3133666904932094, + "grad_norm": 0.7733815312385559, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0327, + "step": 10960 + }, + { + "epoch": 0.31365260900643316, + "grad_norm": 1.2995271682739258, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0427, + "step": 10970 + }, + { + "epoch": 0.3139385275196569, + "grad_norm": 1.1102336645126343, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.04, + "step": 10980 + }, + { + "epoch": 0.31422444603288063, + "grad_norm": 0.7618277668952942, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0488, + "step": 10990 + }, + { + "epoch": 0.31451036454610437, + "grad_norm": 0.5355142951011658, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0436, + "step": 11000 + }, + { + "epoch": 0.3147962830593281, + "grad_norm": 1.3410072326660156, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0463, + "step": 11010 + }, + { + "epoch": 0.31508220157255185, + "grad_norm": 0.7810450196266174, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0493, + "step": 11020 + }, + { + "epoch": 0.3153681200857755, + "grad_norm": 0.6452206373214722, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0354, + "step": 11030 + }, + { + "epoch": 0.31565403859899926, + "grad_norm": 1.037593126296997, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0418, + "step": 11040 + }, + { + "epoch": 0.315939957112223, + "grad_norm": 0.7032834887504578, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0431, + "step": 11050 + }, + { + "epoch": 0.31622587562544674, + "grad_norm": 0.5168939232826233, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0472, + "step": 11060 + }, + { + "epoch": 0.3165117941386705, + "grad_norm": 0.5239925384521484, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0438, + "step": 11070 + }, + { + "epoch": 0.3167977126518942, + "grad_norm": 0.8209654688835144, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0506, + "step": 11080 + }, + { + "epoch": 0.31708363116511795, + "grad_norm": 0.5318232178688049, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0516, + "step": 11090 + }, + { + "epoch": 0.3173695496783417, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0482, + "step": 11100 + }, + { + "epoch": 0.3176554681915654, + "grad_norm": 0.6691215634346008, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.046, + "step": 11110 + }, + { + "epoch": 0.3179413867047891, + "grad_norm": 0.4862753450870514, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 0.31822730521801285, + "grad_norm": 0.4640316963195801, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0433, + "step": 11130 + }, + { + "epoch": 0.3185132237312366, + "grad_norm": 0.7841521501541138, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0445, + "step": 11140 + }, + { + "epoch": 0.3187991422444603, + "grad_norm": 0.6809426546096802, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0518, + "step": 11150 + }, + { + "epoch": 0.31908506075768406, + "grad_norm": 0.6195946931838989, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0569, + "step": 11160 + }, + { + "epoch": 0.3193709792709078, + "grad_norm": 0.7289860248565674, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0487, + "step": 11170 + }, + { + "epoch": 0.31965689778413153, + "grad_norm": 0.5575736165046692, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0409, + "step": 11180 + }, + { + "epoch": 0.31994281629735527, + "grad_norm": 0.8619267344474792, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0424, + "step": 11190 + }, + { + "epoch": 0.320228734810579, + "grad_norm": 0.740242063999176, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0474, + "step": 11200 + }, + { + "epoch": 0.3205146533238027, + "grad_norm": 0.4169894754886627, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0395, + "step": 11210 + }, + { + "epoch": 0.3208005718370264, + "grad_norm": 0.5773794651031494, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0414, + "step": 11220 + }, + { + "epoch": 0.32108649035025016, + "grad_norm": 0.4941500723361969, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0484, + "step": 11230 + }, + { + "epoch": 0.3213724088634739, + "grad_norm": 0.7985579371452332, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.051, + "step": 11240 + }, + { + "epoch": 0.32165832737669764, + "grad_norm": 0.5262066721916199, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0434, + "step": 11250 + }, + { + "epoch": 0.3219442458899214, + "grad_norm": 0.4074312150478363, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0428, + "step": 11260 + }, + { + "epoch": 0.3222301644031451, + "grad_norm": 1.0757715702056885, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0468, + "step": 11270 + }, + { + "epoch": 0.32251608291636885, + "grad_norm": 0.7281575202941895, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0386, + "step": 11280 + }, + { + "epoch": 0.3228020014295926, + "grad_norm": 0.35078516602516174, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0413, + "step": 11290 + }, + { + "epoch": 0.32308791994281627, + "grad_norm": 0.5642452836036682, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0435, + "step": 11300 + }, + { + "epoch": 0.32337383845604, + "grad_norm": 0.5326974987983704, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0459, + "step": 11310 + }, + { + "epoch": 0.32365975696926375, + "grad_norm": 0.6212049126625061, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0451, + "step": 11320 + }, + { + "epoch": 0.3239456754824875, + "grad_norm": 0.4887222349643707, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0445, + "step": 11330 + }, + { + "epoch": 0.3242315939957112, + "grad_norm": 0.6692403554916382, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0423, + "step": 11340 + }, + { + "epoch": 0.32451751250893496, + "grad_norm": 0.7166061997413635, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0445, + "step": 11350 + }, + { + "epoch": 0.3248034310221587, + "grad_norm": 0.5342463850975037, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0394, + "step": 11360 + }, + { + "epoch": 0.32508934953538243, + "grad_norm": 1.0617904663085938, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0401, + "step": 11370 + }, + { + "epoch": 0.32537526804860617, + "grad_norm": 0.9869458675384521, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0508, + "step": 11380 + }, + { + "epoch": 0.32566118656182985, + "grad_norm": 0.32021698355674744, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0346, + "step": 11390 + }, + { + "epoch": 0.3259471050750536, + "grad_norm": 0.6566154360771179, + "learning_rate": 1.486814531655139e-05, + "loss": 0.046, + "step": 11400 + }, + { + "epoch": 0.3262330235882773, + "grad_norm": 0.6716777086257935, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.045, + "step": 11410 + }, + { + "epoch": 0.32651894210150106, + "grad_norm": 0.7489042282104492, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0443, + "step": 11420 + }, + { + "epoch": 0.3268048606147248, + "grad_norm": 0.6040313243865967, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0418, + "step": 11430 + }, + { + "epoch": 0.32709077912794854, + "grad_norm": 0.4891999363899231, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0342, + "step": 11440 + }, + { + "epoch": 0.3273766976411723, + "grad_norm": 0.4264339506626129, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0414, + "step": 11450 + }, + { + "epoch": 0.327662616154396, + "grad_norm": 0.5535606741905212, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0362, + "step": 11460 + }, + { + "epoch": 0.32794853466761975, + "grad_norm": 0.566705048084259, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0472, + "step": 11470 + }, + { + "epoch": 0.32823445318084343, + "grad_norm": 0.8539089560508728, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0478, + "step": 11480 + }, + { + "epoch": 0.32852037169406717, + "grad_norm": 0.3981179893016815, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0429, + "step": 11490 + }, + { + "epoch": 0.3288062902072909, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0487, + "step": 11500 + }, + { + "epoch": 0.32909220872051465, + "grad_norm": 0.45551198720932007, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0384, + "step": 11510 + }, + { + "epoch": 0.3293781272337384, + "grad_norm": 0.6321517825126648, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0541, + "step": 11520 + }, + { + "epoch": 0.3296640457469621, + "grad_norm": 0.7971932888031006, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0445, + "step": 11530 + }, + { + "epoch": 0.32994996426018586, + "grad_norm": 0.5022657513618469, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0414, + "step": 11540 + }, + { + "epoch": 0.3302358827734096, + "grad_norm": 0.7302954196929932, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.044, + "step": 11550 + }, + { + "epoch": 0.33052180128663333, + "grad_norm": 0.5123834013938904, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0451, + "step": 11560 + }, + { + "epoch": 0.330807719799857, + "grad_norm": 0.5261625647544861, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0416, + "step": 11570 + }, + { + "epoch": 0.33109363831308075, + "grad_norm": 0.5782840251922607, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0419, + "step": 11580 + }, + { + "epoch": 0.3313795568263045, + "grad_norm": 0.9754800796508789, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0403, + "step": 11590 + }, + { + "epoch": 0.3316654753395282, + "grad_norm": 0.48157551884651184, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0459, + "step": 11600 + }, + { + "epoch": 0.33195139385275196, + "grad_norm": 0.4394964277744293, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0461, + "step": 11610 + }, + { + "epoch": 0.3322373123659757, + "grad_norm": 1.220790147781372, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0448, + "step": 11620 + }, + { + "epoch": 0.33252323087919944, + "grad_norm": 0.6908231973648071, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0431, + "step": 11630 + }, + { + "epoch": 0.3328091493924232, + "grad_norm": 0.45382779836654663, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0379, + "step": 11640 + }, + { + "epoch": 0.3330950679056469, + "grad_norm": 0.5963619947433472, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0465, + "step": 11650 + }, + { + "epoch": 0.3333809864188706, + "grad_norm": 0.676210880279541, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0411, + "step": 11660 + }, + { + "epoch": 0.33366690493209433, + "grad_norm": 0.893473744392395, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0443, + "step": 11670 + }, + { + "epoch": 0.33395282344531807, + "grad_norm": 0.30655553936958313, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.04, + "step": 11680 + }, + { + "epoch": 0.3342387419585418, + "grad_norm": 0.899615466594696, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0462, + "step": 11690 + }, + { + "epoch": 0.33452466047176554, + "grad_norm": 0.5037568807601929, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0394, + "step": 11700 + }, + { + "epoch": 0.3348105789849893, + "grad_norm": 0.573716402053833, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0426, + "step": 11710 + }, + { + "epoch": 0.335096497498213, + "grad_norm": 0.4985221326351166, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0422, + "step": 11720 + }, + { + "epoch": 0.33538241601143676, + "grad_norm": 0.8864797353744507, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0504, + "step": 11730 + }, + { + "epoch": 0.3356683345246605, + "grad_norm": 0.49209004640579224, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0409, + "step": 11740 + }, + { + "epoch": 0.3359542530378842, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0468, + "step": 11750 + }, + { + "epoch": 0.3362401715511079, + "grad_norm": 0.7552497386932373, + "learning_rate": 1.454836451908656e-05, + "loss": 0.041, + "step": 11760 + }, + { + "epoch": 0.33652609006433165, + "grad_norm": 0.5737242102622986, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0503, + "step": 11770 + }, + { + "epoch": 0.3368120085775554, + "grad_norm": 0.46150341629981995, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0399, + "step": 11780 + }, + { + "epoch": 0.3370979270907791, + "grad_norm": 0.55389803647995, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0442, + "step": 11790 + }, + { + "epoch": 0.33738384560400286, + "grad_norm": 0.7647727727890015, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0472, + "step": 11800 + }, + { + "epoch": 0.3376697641172266, + "grad_norm": 0.8755397200584412, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0444, + "step": 11810 + }, + { + "epoch": 0.33795568263045034, + "grad_norm": 0.9257917404174805, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0416, + "step": 11820 + }, + { + "epoch": 0.3382416011436741, + "grad_norm": 0.4048840403556824, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0418, + "step": 11830 + }, + { + "epoch": 0.33852751965689776, + "grad_norm": 0.584200382232666, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0436, + "step": 11840 + }, + { + "epoch": 0.3388134381701215, + "grad_norm": 0.7565616369247437, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0407, + "step": 11850 + }, + { + "epoch": 0.33909935668334523, + "grad_norm": 0.8025793433189392, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0424, + "step": 11860 + }, + { + "epoch": 0.33938527519656897, + "grad_norm": 0.3123756945133209, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.044, + "step": 11870 + }, + { + "epoch": 0.3396711937097927, + "grad_norm": 0.8047941327095032, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0471, + "step": 11880 + }, + { + "epoch": 0.33995711222301644, + "grad_norm": 0.8675779104232788, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0443, + "step": 11890 + }, + { + "epoch": 0.3402430307362402, + "grad_norm": 0.47229406237602234, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0416, + "step": 11900 + }, + { + "epoch": 0.3405289492494639, + "grad_norm": 0.3775595426559448, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0512, + "step": 11910 + }, + { + "epoch": 0.34081486776268766, + "grad_norm": 0.6179372668266296, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0395, + "step": 11920 + }, + { + "epoch": 0.34110078627591134, + "grad_norm": 0.47618359327316284, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0407, + "step": 11930 + }, + { + "epoch": 0.3413867047891351, + "grad_norm": 0.5495609641075134, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.041, + "step": 11940 + }, + { + "epoch": 0.3416726233023588, + "grad_norm": 0.7276089191436768, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0445, + "step": 11950 + }, + { + "epoch": 0.34195854181558255, + "grad_norm": 0.9464111328125, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0471, + "step": 11960 + }, + { + "epoch": 0.3422444603288063, + "grad_norm": 0.8340250253677368, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0488, + "step": 11970 + }, + { + "epoch": 0.34253037884203, + "grad_norm": 0.6392719149589539, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0407, + "step": 11980 + }, + { + "epoch": 0.34281629735525376, + "grad_norm": 0.7563493251800537, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0388, + "step": 11990 + }, + { + "epoch": 0.3431022158684775, + "grad_norm": 0.7145271301269531, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.042, + "step": 12000 + }, + { + "epoch": 0.34338813438170124, + "grad_norm": 0.6522033214569092, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0507, + "step": 12010 + }, + { + "epoch": 0.3436740528949249, + "grad_norm": 0.4634755849838257, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0388, + "step": 12020 + }, + { + "epoch": 0.34395997140814866, + "grad_norm": 0.6681762337684631, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0498, + "step": 12030 + }, + { + "epoch": 0.3442458899213724, + "grad_norm": 0.5068351626396179, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0484, + "step": 12040 + }, + { + "epoch": 0.34453180843459613, + "grad_norm": 0.5424943566322327, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0406, + "step": 12050 + }, + { + "epoch": 0.34481772694781987, + "grad_norm": 0.674436628818512, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.04, + "step": 12060 + }, + { + "epoch": 0.3451036454610436, + "grad_norm": 0.8140727281570435, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0417, + "step": 12070 + }, + { + "epoch": 0.34538956397426734, + "grad_norm": 0.6394575238227844, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0413, + "step": 12080 + }, + { + "epoch": 0.3456754824874911, + "grad_norm": 0.5134334564208984, + "learning_rate": 1.425047976058418e-05, + "loss": 0.04, + "step": 12090 + }, + { + "epoch": 0.3459614010007148, + "grad_norm": 0.6670883297920227, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0397, + "step": 12100 + }, + { + "epoch": 0.3462473195139385, + "grad_norm": 0.49804338812828064, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0431, + "step": 12110 + }, + { + "epoch": 0.34653323802716224, + "grad_norm": 0.33912673592567444, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0492, + "step": 12120 + }, + { + "epoch": 0.346819156540386, + "grad_norm": 0.45478618144989014, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0427, + "step": 12130 + }, + { + "epoch": 0.3471050750536097, + "grad_norm": 0.6690845489501953, + "learning_rate": 1.420497389129506e-05, + "loss": 0.044, + "step": 12140 + }, + { + "epoch": 0.34739099356683345, + "grad_norm": 0.9296556115150452, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.042, + "step": 12150 + }, + { + "epoch": 0.3476769120800572, + "grad_norm": 0.4859760105609894, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0386, + "step": 12160 + }, + { + "epoch": 0.3479628305932809, + "grad_norm": 1.0067541599273682, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0495, + "step": 12170 + }, + { + "epoch": 0.34824874910650466, + "grad_norm": 0.7799471616744995, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0614, + "step": 12180 + }, + { + "epoch": 0.3485346676197284, + "grad_norm": 0.48603832721710205, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0422, + "step": 12190 + }, + { + "epoch": 0.3488205861329521, + "grad_norm": 1.2030225992202759, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0535, + "step": 12200 + }, + { + "epoch": 0.3491065046461758, + "grad_norm": 0.5523782968521118, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0437, + "step": 12210 + }, + { + "epoch": 0.34939242315939956, + "grad_norm": 0.9041968584060669, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0441, + "step": 12220 + }, + { + "epoch": 0.3496783416726233, + "grad_norm": 0.5859020948410034, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0451, + "step": 12230 + }, + { + "epoch": 0.34996426018584703, + "grad_norm": 0.8736525177955627, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0439, + "step": 12240 + }, + { + "epoch": 0.35025017869907077, + "grad_norm": 0.4692678153514862, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0516, + "step": 12250 + }, + { + "epoch": 0.3505360972122945, + "grad_norm": 0.6326560974121094, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0427, + "step": 12260 + }, + { + "epoch": 0.35082201572551824, + "grad_norm": 0.6265914440155029, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0392, + "step": 12270 + }, + { + "epoch": 0.351107934238742, + "grad_norm": 0.8684681057929993, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0416, + "step": 12280 + }, + { + "epoch": 0.35139385275196566, + "grad_norm": 0.6076116561889648, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0405, + "step": 12290 + }, + { + "epoch": 0.3516797712651894, + "grad_norm": 0.36192813515663147, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0417, + "step": 12300 + }, + { + "epoch": 0.35196568977841314, + "grad_norm": 0.5561486482620239, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0397, + "step": 12310 + }, + { + "epoch": 0.3522516082916369, + "grad_norm": 0.5955346822738647, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0332, + "step": 12320 + }, + { + "epoch": 0.3525375268048606, + "grad_norm": 0.4861294627189636, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0423, + "step": 12330 + }, + { + "epoch": 0.35282344531808435, + "grad_norm": 0.920704185962677, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0467, + "step": 12340 + }, + { + "epoch": 0.3531093638313081, + "grad_norm": 0.4749159514904022, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0425, + "step": 12350 + }, + { + "epoch": 0.3533952823445318, + "grad_norm": 0.5075432658195496, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0362, + "step": 12360 + }, + { + "epoch": 0.35368120085775556, + "grad_norm": 0.3057022988796234, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0378, + "step": 12370 + }, + { + "epoch": 0.35396711937097924, + "grad_norm": 0.48122167587280273, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0359, + "step": 12380 + }, + { + "epoch": 0.354253037884203, + "grad_norm": 0.39227673411369324, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0432, + "step": 12390 + }, + { + "epoch": 0.3545389563974267, + "grad_norm": 0.641839861869812, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0422, + "step": 12400 + }, + { + "epoch": 0.35482487491065046, + "grad_norm": 1.0422887802124023, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0445, + "step": 12410 + }, + { + "epoch": 0.3551107934238742, + "grad_norm": 0.5336428880691528, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0408, + "step": 12420 + }, + { + "epoch": 0.35539671193709793, + "grad_norm": 0.6634368896484375, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0374, + "step": 12430 + }, + { + "epoch": 0.35568263045032167, + "grad_norm": 0.5840758085250854, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0417, + "step": 12440 + }, + { + "epoch": 0.3559685489635454, + "grad_norm": 0.8465530872344971, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0449, + "step": 12450 + }, + { + "epoch": 0.35625446747676914, + "grad_norm": 0.48737838864326477, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0439, + "step": 12460 + }, + { + "epoch": 0.3565403859899928, + "grad_norm": 1.2267687320709229, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0395, + "step": 12470 + }, + { + "epoch": 0.35682630450321656, + "grad_norm": 0.4097842276096344, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0379, + "step": 12480 + }, + { + "epoch": 0.3571122230164403, + "grad_norm": 0.8895343542098999, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0415, + "step": 12490 + }, + { + "epoch": 0.35739814152966404, + "grad_norm": 0.6732933521270752, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0432, + "step": 12500 + }, + { + "epoch": 0.3576840600428878, + "grad_norm": 0.4521937966346741, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0442, + "step": 12510 + }, + { + "epoch": 0.3579699785561115, + "grad_norm": 0.5932701826095581, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0407, + "step": 12520 + }, + { + "epoch": 0.35825589706933525, + "grad_norm": 0.5595138669013977, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0387, + "step": 12530 + }, + { + "epoch": 0.358541815582559, + "grad_norm": 0.7205538153648376, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0393, + "step": 12540 + }, + { + "epoch": 0.3588277340957827, + "grad_norm": 0.4069580137729645, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0554, + "step": 12550 + }, + { + "epoch": 0.3591136526090064, + "grad_norm": 0.4881740212440491, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0411, + "step": 12560 + }, + { + "epoch": 0.35939957112223014, + "grad_norm": 0.7710328102111816, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.043, + "step": 12570 + }, + { + "epoch": 0.3596854896354539, + "grad_norm": 0.6593908071517944, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.046, + "step": 12580 + }, + { + "epoch": 0.3599714081486776, + "grad_norm": 0.6712149977684021, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0392, + "step": 12590 + }, + { + "epoch": 0.36025732666190136, + "grad_norm": 0.6103658080101013, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0482, + "step": 12600 + }, + { + "epoch": 0.3605432451751251, + "grad_norm": 0.5170528292655945, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0441, + "step": 12610 + }, + { + "epoch": 0.36082916368834883, + "grad_norm": 0.47434374690055847, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0436, + "step": 12620 + }, + { + "epoch": 0.36111508220157257, + "grad_norm": 0.6546452045440674, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0441, + "step": 12630 + }, + { + "epoch": 0.3614010007147963, + "grad_norm": 1.3334686756134033, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0464, + "step": 12640 + }, + { + "epoch": 0.36168691922802, + "grad_norm": 1.3882309198379517, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0527, + "step": 12650 + }, + { + "epoch": 0.3619728377412437, + "grad_norm": 0.829872190952301, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0499, + "step": 12660 + }, + { + "epoch": 0.36225875625446746, + "grad_norm": 0.6917227506637573, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0513, + "step": 12670 + }, + { + "epoch": 0.3625446747676912, + "grad_norm": 0.3825722634792328, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0455, + "step": 12680 + }, + { + "epoch": 0.36283059328091494, + "grad_norm": 0.7726976275444031, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0366, + "step": 12690 + }, + { + "epoch": 0.3631165117941387, + "grad_norm": 0.48851099610328674, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0363, + "step": 12700 + }, + { + "epoch": 0.3634024303073624, + "grad_norm": 0.5034362077713013, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0461, + "step": 12710 + }, + { + "epoch": 0.36368834882058615, + "grad_norm": 0.8411096334457397, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0448, + "step": 12720 + }, + { + "epoch": 0.3639742673338099, + "grad_norm": 0.7185337543487549, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0366, + "step": 12730 + }, + { + "epoch": 0.36426018584703357, + "grad_norm": 0.5850857496261597, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0414, + "step": 12740 + }, + { + "epoch": 0.3645461043602573, + "grad_norm": 0.47304606437683105, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0464, + "step": 12750 + }, + { + "epoch": 0.36483202287348104, + "grad_norm": 0.7190109491348267, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0418, + "step": 12760 + }, + { + "epoch": 0.3651179413867048, + "grad_norm": 0.8053406476974487, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0407, + "step": 12770 + }, + { + "epoch": 0.3654038598999285, + "grad_norm": 0.8875076174736023, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0471, + "step": 12780 + }, + { + "epoch": 0.36568977841315226, + "grad_norm": 0.5206999182701111, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0478, + "step": 12790 + }, + { + "epoch": 0.365975696926376, + "grad_norm": 0.5034269690513611, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0422, + "step": 12800 + }, + { + "epoch": 0.36626161543959973, + "grad_norm": 0.9846853017807007, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.045, + "step": 12810 + }, + { + "epoch": 0.36654753395282347, + "grad_norm": 0.49341151118278503, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0471, + "step": 12820 + }, + { + "epoch": 0.36683345246604715, + "grad_norm": 0.765583336353302, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0411, + "step": 12830 + }, + { + "epoch": 0.3671193709792709, + "grad_norm": 0.5193378925323486, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0522, + "step": 12840 + }, + { + "epoch": 0.3674052894924946, + "grad_norm": 0.8142374157905579, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0374, + "step": 12850 + }, + { + "epoch": 0.36769120800571836, + "grad_norm": 0.7233540415763855, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0516, + "step": 12860 + }, + { + "epoch": 0.3679771265189421, + "grad_norm": 0.38758793473243713, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0437, + "step": 12870 + }, + { + "epoch": 0.36826304503216584, + "grad_norm": 0.36923956871032715, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.041, + "step": 12880 + }, + { + "epoch": 0.3685489635453896, + "grad_norm": 1.0518147945404053, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0446, + "step": 12890 + }, + { + "epoch": 0.3688348820586133, + "grad_norm": 0.5833591818809509, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0362, + "step": 12900 + }, + { + "epoch": 0.36912080057183705, + "grad_norm": 0.6178849339485168, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.041, + "step": 12910 + }, + { + "epoch": 0.36940671908506073, + "grad_norm": 0.7599044442176819, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0473, + "step": 12920 + }, + { + "epoch": 0.36969263759828447, + "grad_norm": 0.7787651419639587, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0461, + "step": 12930 + }, + { + "epoch": 0.3699785561115082, + "grad_norm": 0.3847586512565613, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0413, + "step": 12940 + }, + { + "epoch": 0.37026447462473194, + "grad_norm": 0.6218805313110352, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0424, + "step": 12950 + }, + { + "epoch": 0.3705503931379557, + "grad_norm": 0.6770363450050354, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0426, + "step": 12960 + }, + { + "epoch": 0.3708363116511794, + "grad_norm": 0.6817107796669006, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.041, + "step": 12970 + }, + { + "epoch": 0.37112223016440316, + "grad_norm": 1.6997944116592407, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0626, + "step": 12980 + }, + { + "epoch": 0.3714081486776269, + "grad_norm": 0.4540708363056183, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0356, + "step": 12990 + }, + { + "epoch": 0.37169406719085063, + "grad_norm": 0.4272336959838867, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0354, + "step": 13000 + }, + { + "epoch": 0.3719799857040743, + "grad_norm": 0.4723891019821167, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0438, + "step": 13010 + }, + { + "epoch": 0.37226590421729805, + "grad_norm": 0.5508099794387817, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.042, + "step": 13020 + }, + { + "epoch": 0.3725518227305218, + "grad_norm": 1.05836021900177, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0472, + "step": 13030 + }, + { + "epoch": 0.3728377412437455, + "grad_norm": 0.4397801458835602, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0462, + "step": 13040 + }, + { + "epoch": 0.37312365975696926, + "grad_norm": 0.3131158649921417, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0383, + "step": 13050 + }, + { + "epoch": 0.373409578270193, + "grad_norm": 0.5489990711212158, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0398, + "step": 13060 + }, + { + "epoch": 0.37369549678341674, + "grad_norm": 0.7425751686096191, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0416, + "step": 13070 + }, + { + "epoch": 0.3739814152966405, + "grad_norm": 0.6337125301361084, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0387, + "step": 13080 + }, + { + "epoch": 0.3742673338098642, + "grad_norm": 0.656467854976654, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0431, + "step": 13090 + }, + { + "epoch": 0.3745532523230879, + "grad_norm": 0.7011964321136475, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0487, + "step": 13100 + }, + { + "epoch": 0.37483917083631163, + "grad_norm": 0.4949609041213989, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0429, + "step": 13110 + }, + { + "epoch": 0.37512508934953537, + "grad_norm": 0.6796516180038452, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0405, + "step": 13120 + }, + { + "epoch": 0.3754110078627591, + "grad_norm": 0.41161492466926575, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0359, + "step": 13130 + }, + { + "epoch": 0.37569692637598284, + "grad_norm": 0.4463254511356354, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0353, + "step": 13140 + }, + { + "epoch": 0.3759828448892066, + "grad_norm": 0.4082377254962921, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.047, + "step": 13150 + }, + { + "epoch": 0.3762687634024303, + "grad_norm": 0.7927104830741882, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0484, + "step": 13160 + }, + { + "epoch": 0.37655468191565405, + "grad_norm": 0.5212385058403015, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.041, + "step": 13170 + }, + { + "epoch": 0.3768406004288778, + "grad_norm": 0.7408128380775452, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0462, + "step": 13180 + }, + { + "epoch": 0.3771265189421015, + "grad_norm": 0.3847906291484833, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0361, + "step": 13190 + }, + { + "epoch": 0.3774124374553252, + "grad_norm": 0.5039756298065186, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0385, + "step": 13200 + }, + { + "epoch": 0.37769835596854895, + "grad_norm": 0.5682945251464844, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0369, + "step": 13210 + }, + { + "epoch": 0.3779842744817727, + "grad_norm": 0.5985261797904968, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0376, + "step": 13220 + }, + { + "epoch": 0.3782701929949964, + "grad_norm": 0.7080312967300415, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0514, + "step": 13230 + }, + { + "epoch": 0.37855611150822016, + "grad_norm": 0.7488406300544739, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0421, + "step": 13240 + }, + { + "epoch": 0.3788420300214439, + "grad_norm": 0.38066044449806213, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0411, + "step": 13250 + }, + { + "epoch": 0.37912794853466764, + "grad_norm": 0.6335283517837524, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0526, + "step": 13260 + }, + { + "epoch": 0.3794138670478914, + "grad_norm": 0.7008160352706909, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0402, + "step": 13270 + }, + { + "epoch": 0.37969978556111506, + "grad_norm": 0.4219777286052704, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.039, + "step": 13280 + }, + { + "epoch": 0.3799857040743388, + "grad_norm": 0.6447705030441284, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0412, + "step": 13290 + }, + { + "epoch": 0.38027162258756253, + "grad_norm": 0.4625374674797058, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0437, + "step": 13300 + }, + { + "epoch": 0.38055754110078627, + "grad_norm": 0.4056257903575897, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0377, + "step": 13310 + }, + { + "epoch": 0.38084345961401, + "grad_norm": 0.425281286239624, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0378, + "step": 13320 + }, + { + "epoch": 0.38112937812723374, + "grad_norm": 0.4031837582588196, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0361, + "step": 13330 + }, + { + "epoch": 0.3814152966404575, + "grad_norm": 0.469175785779953, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0391, + "step": 13340 + }, + { + "epoch": 0.3817012151536812, + "grad_norm": 0.36555227637290955, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0352, + "step": 13350 + }, + { + "epoch": 0.38198713366690495, + "grad_norm": 0.8802763819694519, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0412, + "step": 13360 + }, + { + "epoch": 0.38227305218012864, + "grad_norm": 0.5733079314231873, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0418, + "step": 13370 + }, + { + "epoch": 0.3825589706933524, + "grad_norm": 0.606238842010498, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0518, + "step": 13380 + }, + { + "epoch": 0.3828448892065761, + "grad_norm": 0.5096673369407654, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0404, + "step": 13390 + }, + { + "epoch": 0.38313080771979985, + "grad_norm": 0.8240867853164673, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0513, + "step": 13400 + }, + { + "epoch": 0.3834167262330236, + "grad_norm": 0.3757685422897339, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0407, + "step": 13410 + }, + { + "epoch": 0.3837026447462473, + "grad_norm": 0.4560941755771637, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0429, + "step": 13420 + }, + { + "epoch": 0.38398856325947106, + "grad_norm": 0.42831951379776, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0387, + "step": 13430 + }, + { + "epoch": 0.3842744817726948, + "grad_norm": 0.8373785614967346, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0473, + "step": 13440 + }, + { + "epoch": 0.38456040028591854, + "grad_norm": 0.9560670256614685, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0442, + "step": 13450 + }, + { + "epoch": 0.3848463187991422, + "grad_norm": 0.4101570248603821, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0429, + "step": 13460 + }, + { + "epoch": 0.38513223731236595, + "grad_norm": 0.673739492893219, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0525, + "step": 13470 + }, + { + "epoch": 0.3854181558255897, + "grad_norm": 1.126909852027893, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0499, + "step": 13480 + }, + { + "epoch": 0.38570407433881343, + "grad_norm": 0.571437656879425, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0431, + "step": 13490 + }, + { + "epoch": 0.38598999285203717, + "grad_norm": 0.5121229887008667, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0419, + "step": 13500 + }, + { + "epoch": 0.3862759113652609, + "grad_norm": 0.6143786907196045, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0373, + "step": 13510 + }, + { + "epoch": 0.38656182987848464, + "grad_norm": 0.395014226436615, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0457, + "step": 13520 + }, + { + "epoch": 0.3868477483917084, + "grad_norm": 0.46027693152427673, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0372, + "step": 13530 + }, + { + "epoch": 0.3871336669049321, + "grad_norm": 0.42744559049606323, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0417, + "step": 13540 + }, + { + "epoch": 0.3874195854181558, + "grad_norm": 0.4765837490558624, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0442, + "step": 13550 + }, + { + "epoch": 0.38770550393137954, + "grad_norm": 0.9767054319381714, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0397, + "step": 13560 + }, + { + "epoch": 0.3879914224446033, + "grad_norm": 0.5535935759544373, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0423, + "step": 13570 + }, + { + "epoch": 0.388277340957827, + "grad_norm": 0.3802829384803772, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0329, + "step": 13580 + }, + { + "epoch": 0.38856325947105075, + "grad_norm": 0.6564178466796875, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0423, + "step": 13590 + }, + { + "epoch": 0.3888491779842745, + "grad_norm": 0.4400223195552826, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0356, + "step": 13600 + }, + { + "epoch": 0.3891350964974982, + "grad_norm": 0.4441612958908081, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0576, + "step": 13610 + }, + { + "epoch": 0.38942101501072196, + "grad_norm": 0.5270922780036926, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0406, + "step": 13620 + }, + { + "epoch": 0.3897069335239457, + "grad_norm": 0.6497722268104553, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0389, + "step": 13630 + }, + { + "epoch": 0.3899928520371694, + "grad_norm": 0.628182053565979, + "learning_rate": 1.280216624157504e-05, + "loss": 0.049, + "step": 13640 + }, + { + "epoch": 0.3902787705503931, + "grad_norm": 0.5242640376091003, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0389, + "step": 13650 + }, + { + "epoch": 0.39056468906361685, + "grad_norm": 0.5140895843505859, + "learning_rate": 1.278305741539386e-05, + "loss": 0.047, + "step": 13660 + }, + { + "epoch": 0.3908506075768406, + "grad_norm": 0.531012773513794, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0415, + "step": 13670 + }, + { + "epoch": 0.39113652609006433, + "grad_norm": 0.5066007375717163, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0411, + "step": 13680 + }, + { + "epoch": 0.39142244460328807, + "grad_norm": 1.0783177614212036, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0371, + "step": 13690 + }, + { + "epoch": 0.3917083631165118, + "grad_norm": 0.592755913734436, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0402, + "step": 13700 + }, + { + "epoch": 0.39199428162973554, + "grad_norm": 0.5595790147781372, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0543, + "step": 13710 + }, + { + "epoch": 0.3922802001429593, + "grad_norm": 0.5388237237930298, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0487, + "step": 13720 + }, + { + "epoch": 0.39256611865618296, + "grad_norm": 0.5311065316200256, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0389, + "step": 13730 + }, + { + "epoch": 0.3928520371694067, + "grad_norm": 0.8037494421005249, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0456, + "step": 13740 + }, + { + "epoch": 0.39313795568263044, + "grad_norm": 0.851921796798706, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0389, + "step": 13750 + }, + { + "epoch": 0.3934238741958542, + "grad_norm": 0.5924596190452576, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0401, + "step": 13760 + }, + { + "epoch": 0.3937097927090779, + "grad_norm": 0.5660725831985474, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0443, + "step": 13770 + }, + { + "epoch": 0.39399571122230165, + "grad_norm": 0.4110502004623413, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0438, + "step": 13780 + }, + { + "epoch": 0.3942816297355254, + "grad_norm": 0.7104408144950867, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.042, + "step": 13790 + }, + { + "epoch": 0.3945675482487491, + "grad_norm": 0.5490137338638306, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0477, + "step": 13800 + }, + { + "epoch": 0.39485346676197286, + "grad_norm": 0.4189203083515167, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0446, + "step": 13810 + }, + { + "epoch": 0.39513938527519654, + "grad_norm": 3.620929479598999, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0541, + "step": 13820 + }, + { + "epoch": 0.3954253037884203, + "grad_norm": 0.4670915901660919, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0391, + "step": 13830 + }, + { + "epoch": 0.395711222301644, + "grad_norm": 0.4475649297237396, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.04, + "step": 13840 + }, + { + "epoch": 0.39599714081486775, + "grad_norm": 0.4646693170070648, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0412, + "step": 13850 + }, + { + "epoch": 0.3962830593280915, + "grad_norm": 0.4141371250152588, + "learning_rate": 1.259152361972498e-05, + "loss": 0.039, + "step": 13860 + }, + { + "epoch": 0.39656897784131523, + "grad_norm": 0.7549411058425903, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0414, + "step": 13870 + }, + { + "epoch": 0.39685489635453897, + "grad_norm": 0.5687856078147888, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0441, + "step": 13880 + }, + { + "epoch": 0.3971408148677627, + "grad_norm": 0.582946240901947, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0451, + "step": 13890 + }, + { + "epoch": 0.39742673338098644, + "grad_norm": 0.6410595178604126, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0362, + "step": 13900 + }, + { + "epoch": 0.3977126518942101, + "grad_norm": 0.4375670850276947, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0552, + "step": 13910 + }, + { + "epoch": 0.39799857040743386, + "grad_norm": 0.5675646662712097, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0373, + "step": 13920 + }, + { + "epoch": 0.3982844889206576, + "grad_norm": 0.544170618057251, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0449, + "step": 13930 + }, + { + "epoch": 0.39857040743388134, + "grad_norm": 0.44928276538848877, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0461, + "step": 13940 + }, + { + "epoch": 0.3988563259471051, + "grad_norm": 0.511382520198822, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0413, + "step": 13950 + }, + { + "epoch": 0.3991422444603288, + "grad_norm": 0.38443753123283386, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0374, + "step": 13960 + }, + { + "epoch": 0.39942816297355255, + "grad_norm": 0.5726080536842346, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0553, + "step": 13970 + }, + { + "epoch": 0.3997140814867763, + "grad_norm": 0.554694414138794, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0404, + "step": 13980 + }, + { + "epoch": 0.4, + "grad_norm": 0.4891316592693329, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0418, + "step": 13990 + }, + { + "epoch": 0.4002859185132237, + "grad_norm": 0.5150312781333923, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0418, + "step": 14000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.74494465933312e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/training_args.bin b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a8e9db2fc8c02e02c3d9dc8ab6720ad303a5b3a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612ba70c7690571cb25b3741b149289d0da6675f330268700d4dd75e92ecc19a +size 6097 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/added_tokens.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/generation_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00001-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9378e420b006cd3dd52c55a41fed3aa0d7cc2e01 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f08092200aa83d65896aeeaa10a6d5baa0399bcde4d62b274e9a48f22061c012 +size 4921072616 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00002-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..de4f8567d6274e82527d9717e30ffa1f77fac9dd --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1d843ebd5b7ff4460da57791fed1e41587e6cd78126a2c2b26078082ab90689 +size 4978830984 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00003-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8b3a11946ef7cb2352410164cffb064fddc4ff44 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22356a1854d3a91ba4553b002ab16cacd122e5087cde23ca6a9567324bd4c79c +size 4100977896 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model.safetensors.index.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/norm_stats.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..7a37358d95e92a337ffbc69008e6d3a514583ff2 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -15.553912042236327, + -29.199742523193358, + -19.58108451538086, + -2.290254103851318, + -3.98537020587921, + -3.326780859374999, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 20.256868560791013, + 29.94644501495361, + 21.81786548461914, + 2.931905368041992, + 5.064435471534729, + 3.8213318216323877, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.8829866647720337, + 2.0021812915802, + 0.2094610631465912, + 0.0940750315785408, + 0.0910087525844574, + 0.012966467998921871, + -0.09716881066560745, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.976093769073486, + 10.930583953857422, + 8.330232620239258, + 0.8605863451957703, + 1.5304595232009888, + 1.1747541427612305, + 0.995267927646637, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -14.624815139007566, + -31.510755078125, + -35.281760287475585, + -4.413841687011719, + -8.509904860687255, + -6.548201916885375, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 40.4127169593811, + 31.91034956970215, + 26.84413584289551, + 7.540738459014893, + 10.178268561553956, + 9.913993389892582, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 10.31286334991455, + 3.0421667098999023, + -4.947638511657715, + 0.41632387042045593, + -0.9987452030181885, + -0.18793217837810516, + -0.08814626932144165, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 10.463665962219238, + 14.231209754943848, + 11.03242301940918, + 2.1795010566711426, + 3.3540749549865723, + 2.708117961883545, + 0.9961075186729431, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/pi0.yaml b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/special_tokens_map.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer.model b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/trainer_state.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b27bcd0ecff7bd46b407d1b898f69e83c9a3e559 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/trainer_state.json @@ -0,0 +1,11234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.45746962115796996, + "eval_steps": 500, + "global_step": 16000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002859185132237312, + "grad_norm": 4.32843542098999, + "learning_rate": 1.8e-07, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0005718370264474624, + "grad_norm": 5.184113502502441, + "learning_rate": 3.8e-07, + "loss": 0.6206, + "step": 20 + }, + { + "epoch": 0.0008577555396711937, + "grad_norm": 4.515527248382568, + "learning_rate": 5.800000000000001e-07, + "loss": 0.582, + "step": 30 + }, + { + "epoch": 0.0011436740528949249, + "grad_norm": 2.8382818698883057, + "learning_rate": 7.8e-07, + "loss": 0.544, + "step": 40 + }, + { + "epoch": 0.0014295925661186562, + "grad_norm": 4.019079208374023, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6381, + "step": 50 + }, + { + "epoch": 0.0017155110793423873, + "grad_norm": 2.9916157722473145, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5463, + "step": 60 + }, + { + "epoch": 0.0020014295925661185, + "grad_norm": 3.3288328647613525, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.446, + "step": 70 + }, + { + "epoch": 0.0022873481057898498, + "grad_norm": 3.181410312652588, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4497, + "step": 80 + }, + { + "epoch": 0.002573266619013581, + "grad_norm": 1.421942949295044, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.349, + "step": 90 + }, + { + "epoch": 0.0028591851322373124, + "grad_norm": 1.908596396446228, + "learning_rate": 1.98e-06, + "loss": 0.3338, + "step": 100 + }, + { + "epoch": 0.0031451036454610438, + "grad_norm": 1.8309729099273682, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.0034310221586847747, + "grad_norm": 3.051408290863037, + "learning_rate": 2.38e-06, + "loss": 0.2418, + "step": 120 + }, + { + "epoch": 0.003716940671908506, + "grad_norm": 2.4083356857299805, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1726, + "step": 130 + }, + { + "epoch": 0.004002859185132237, + "grad_norm": 1.111687421798706, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.2164, + "step": 140 + }, + { + "epoch": 0.004288777698355968, + "grad_norm": 1.3874679803848267, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1312, + "step": 150 + }, + { + "epoch": 0.0045746962115796996, + "grad_norm": 1.2791540622711182, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1198, + "step": 160 + }, + { + "epoch": 0.004860614724803431, + "grad_norm": 1.6237181425094604, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1027, + "step": 170 + }, + { + "epoch": 0.005146533238027162, + "grad_norm": 0.9669432640075684, + "learning_rate": 3.58e-06, + "loss": 0.0968, + "step": 180 + }, + { + "epoch": 0.0054324517512508936, + "grad_norm": 1.4933182001113892, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.1012, + "step": 190 + }, + { + "epoch": 0.005718370264474625, + "grad_norm": 1.8615745306015015, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0901, + "step": 200 + }, + { + "epoch": 0.006004288777698356, + "grad_norm": 1.867163062095642, + "learning_rate": 4.18e-06, + "loss": 0.1067, + "step": 210 + }, + { + "epoch": 0.0062902072909220876, + "grad_norm": 1.199497103691101, + "learning_rate": 4.38e-06, + "loss": 0.0841, + "step": 220 + }, + { + "epoch": 0.006576125804145818, + "grad_norm": 1.1568272113800049, + "learning_rate": 4.58e-06, + "loss": 0.0951, + "step": 230 + }, + { + "epoch": 0.006862044317369549, + "grad_norm": 2.139226198196411, + "learning_rate": 4.78e-06, + "loss": 0.0845, + "step": 240 + }, + { + "epoch": 0.007147962830593281, + "grad_norm": 1.0357667207717896, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0828, + "step": 250 + }, + { + "epoch": 0.007433881343817012, + "grad_norm": 1.0145683288574219, + "learning_rate": 5.18e-06, + "loss": 0.0925, + "step": 260 + }, + { + "epoch": 0.007719799857040743, + "grad_norm": 1.308053731918335, + "learning_rate": 5.380000000000001e-06, + "loss": 0.082, + "step": 270 + }, + { + "epoch": 0.008005718370264474, + "grad_norm": 1.1561739444732666, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0888, + "step": 280 + }, + { + "epoch": 0.008291636883488206, + "grad_norm": 0.8777005672454834, + "learning_rate": 5.78e-06, + "loss": 0.0693, + "step": 290 + }, + { + "epoch": 0.008577555396711936, + "grad_norm": 0.9127368330955505, + "learning_rate": 5.98e-06, + "loss": 0.0823, + "step": 300 + }, + { + "epoch": 0.008863473909935669, + "grad_norm": 0.5608117580413818, + "learning_rate": 6.18e-06, + "loss": 0.0733, + "step": 310 + }, + { + "epoch": 0.009149392423159399, + "grad_norm": 1.9068444967269897, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0772, + "step": 320 + }, + { + "epoch": 0.009435310936383131, + "grad_norm": 0.9090886116027832, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.062, + "step": 330 + }, + { + "epoch": 0.009721229449606862, + "grad_norm": 1.191778540611267, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0718, + "step": 340 + }, + { + "epoch": 0.010007147962830594, + "grad_norm": 1.3743036985397339, + "learning_rate": 6.98e-06, + "loss": 0.0822, + "step": 350 + }, + { + "epoch": 0.010293066476054324, + "grad_norm": 1.4244364500045776, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0793, + "step": 360 + }, + { + "epoch": 0.010578984989278055, + "grad_norm": 1.1766910552978516, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0637, + "step": 370 + }, + { + "epoch": 0.010864903502501787, + "grad_norm": 1.1331329345703125, + "learning_rate": 7.58e-06, + "loss": 0.0705, + "step": 380 + }, + { + "epoch": 0.011150822015725518, + "grad_norm": 0.4898548424243927, + "learning_rate": 7.78e-06, + "loss": 0.0686, + "step": 390 + }, + { + "epoch": 0.01143674052894925, + "grad_norm": 0.7398406267166138, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0719, + "step": 400 + }, + { + "epoch": 0.01172265904217298, + "grad_norm": 1.1516162157058716, + "learning_rate": 8.18e-06, + "loss": 0.0696, + "step": 410 + }, + { + "epoch": 0.012008577555396712, + "grad_norm": 1.6034163236618042, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0698, + "step": 420 + }, + { + "epoch": 0.012294496068620443, + "grad_norm": 1.2195311784744263, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.012580414581844175, + "grad_norm": 1.1106441020965576, + "learning_rate": 8.78e-06, + "loss": 0.0749, + "step": 440 + }, + { + "epoch": 0.012866333095067906, + "grad_norm": 1.1787506341934204, + "learning_rate": 8.98e-06, + "loss": 0.0718, + "step": 450 + }, + { + "epoch": 0.013152251608291636, + "grad_norm": 0.4380492568016052, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0692, + "step": 460 + }, + { + "epoch": 0.013438170121515368, + "grad_norm": 1.0138392448425293, + "learning_rate": 9.38e-06, + "loss": 0.0718, + "step": 470 + }, + { + "epoch": 0.013724088634739099, + "grad_norm": 0.50003582239151, + "learning_rate": 9.58e-06, + "loss": 0.078, + "step": 480 + }, + { + "epoch": 0.014010007147962831, + "grad_norm": 0.6253323554992676, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0762, + "step": 490 + }, + { + "epoch": 0.014295925661186561, + "grad_norm": 0.6725791096687317, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0615, + "step": 500 + }, + { + "epoch": 0.014581844174410294, + "grad_norm": 0.6100206971168518, + "learning_rate": 1.018e-05, + "loss": 0.0576, + "step": 510 + }, + { + "epoch": 0.014867762687634024, + "grad_norm": 1.9225071668624878, + "learning_rate": 1.038e-05, + "loss": 0.0957, + "step": 520 + }, + { + "epoch": 0.015153681200857756, + "grad_norm": 1.304625391960144, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.015439599714081487, + "grad_norm": 0.7657200694084167, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0612, + "step": 540 + }, + { + "epoch": 0.015725518227305217, + "grad_norm": 0.7371220588684082, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0719, + "step": 550 + }, + { + "epoch": 0.016011436740528948, + "grad_norm": 0.7274985313415527, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0594, + "step": 560 + }, + { + "epoch": 0.01629735525375268, + "grad_norm": 1.3222947120666504, + "learning_rate": 1.138e-05, + "loss": 0.0655, + "step": 570 + }, + { + "epoch": 0.016583273766976412, + "grad_norm": 0.965411901473999, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0638, + "step": 580 + }, + { + "epoch": 0.016869192280200142, + "grad_norm": 0.8161532878875732, + "learning_rate": 1.178e-05, + "loss": 0.0532, + "step": 590 + }, + { + "epoch": 0.017155110793423873, + "grad_norm": 0.8228808045387268, + "learning_rate": 1.198e-05, + "loss": 0.051, + "step": 600 + }, + { + "epoch": 0.017441029306647607, + "grad_norm": 0.6932743191719055, + "learning_rate": 1.218e-05, + "loss": 0.0595, + "step": 610 + }, + { + "epoch": 0.017726947819871337, + "grad_norm": 0.6848511099815369, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0589, + "step": 620 + }, + { + "epoch": 0.018012866333095068, + "grad_norm": 1.137454867362976, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0694, + "step": 630 + }, + { + "epoch": 0.018298784846318798, + "grad_norm": 0.8087878227233887, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0738, + "step": 640 + }, + { + "epoch": 0.01858470335954253, + "grad_norm": 0.8093737363815308, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.058, + "step": 650 + }, + { + "epoch": 0.018870621872766263, + "grad_norm": 0.8387401700019836, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0686, + "step": 660 + }, + { + "epoch": 0.019156540385989993, + "grad_norm": 1.1544110774993896, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0592, + "step": 670 + }, + { + "epoch": 0.019442458899213724, + "grad_norm": 0.8208314180374146, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.019728377412437454, + "grad_norm": 0.97088623046875, + "learning_rate": 1.378e-05, + "loss": 0.0675, + "step": 690 + }, + { + "epoch": 0.020014295925661188, + "grad_norm": 1.0991814136505127, + "learning_rate": 1.398e-05, + "loss": 0.0745, + "step": 700 + }, + { + "epoch": 0.02030021443888492, + "grad_norm": 0.9467299580574036, + "learning_rate": 1.418e-05, + "loss": 0.0645, + "step": 710 + }, + { + "epoch": 0.02058613295210865, + "grad_norm": 0.4910801351070404, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0466, + "step": 720 + }, + { + "epoch": 0.02087205146533238, + "grad_norm": 1.0102845430374146, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0735, + "step": 730 + }, + { + "epoch": 0.02115796997855611, + "grad_norm": 0.9033467769622803, + "learning_rate": 1.478e-05, + "loss": 0.0741, + "step": 740 + }, + { + "epoch": 0.021443888491779844, + "grad_norm": 1.6092171669006348, + "learning_rate": 1.498e-05, + "loss": 0.0737, + "step": 750 + }, + { + "epoch": 0.021729807005003574, + "grad_norm": 0.7047333717346191, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0604, + "step": 760 + }, + { + "epoch": 0.022015725518227305, + "grad_norm": 1.2015491724014282, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0646, + "step": 770 + }, + { + "epoch": 0.022301644031451035, + "grad_norm": 1.1669623851776123, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0587, + "step": 780 + }, + { + "epoch": 0.02258756254467477, + "grad_norm": 1.137113094329834, + "learning_rate": 1.578e-05, + "loss": 0.0692, + "step": 790 + }, + { + "epoch": 0.0228734810578985, + "grad_norm": 1.269505262374878, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0711, + "step": 800 + }, + { + "epoch": 0.02315939957112223, + "grad_norm": 0.942534863948822, + "learning_rate": 1.618e-05, + "loss": 0.0782, + "step": 810 + }, + { + "epoch": 0.02344531808434596, + "grad_norm": 0.9548556208610535, + "learning_rate": 1.638e-05, + "loss": 0.0814, + "step": 820 + }, + { + "epoch": 0.02373123659756969, + "grad_norm": 1.0210421085357666, + "learning_rate": 1.658e-05, + "loss": 0.0774, + "step": 830 + }, + { + "epoch": 0.024017155110793425, + "grad_norm": 1.0955135822296143, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0693, + "step": 840 + }, + { + "epoch": 0.024303073624017155, + "grad_norm": 1.2081682682037354, + "learning_rate": 1.698e-05, + "loss": 0.0589, + "step": 850 + }, + { + "epoch": 0.024588992137240886, + "grad_norm": 0.9728164076805115, + "learning_rate": 1.718e-05, + "loss": 0.0585, + "step": 860 + }, + { + "epoch": 0.024874910650464616, + "grad_norm": 1.310244083404541, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.066, + "step": 870 + }, + { + "epoch": 0.02516082916368835, + "grad_norm": 0.8860681653022766, + "learning_rate": 1.758e-05, + "loss": 0.0703, + "step": 880 + }, + { + "epoch": 0.02544674767691208, + "grad_norm": 2.1878466606140137, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0913, + "step": 890 + }, + { + "epoch": 0.02573266619013581, + "grad_norm": 0.6659205555915833, + "learning_rate": 1.798e-05, + "loss": 0.0603, + "step": 900 + }, + { + "epoch": 0.02601858470335954, + "grad_norm": 0.6700656414031982, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.074, + "step": 910 + }, + { + "epoch": 0.026304503216583272, + "grad_norm": 0.8292778134346008, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0529, + "step": 920 + }, + { + "epoch": 0.026590421729807006, + "grad_norm": 0.9637550115585327, + "learning_rate": 1.858e-05, + "loss": 0.0604, + "step": 930 + }, + { + "epoch": 0.026876340243030736, + "grad_norm": 0.4605652689933777, + "learning_rate": 1.878e-05, + "loss": 0.0657, + "step": 940 + }, + { + "epoch": 0.027162258756254467, + "grad_norm": 1.3346972465515137, + "learning_rate": 1.898e-05, + "loss": 0.0576, + "step": 950 + }, + { + "epoch": 0.027448177269478197, + "grad_norm": 0.8369432091712952, + "learning_rate": 1.918e-05, + "loss": 0.0567, + "step": 960 + }, + { + "epoch": 0.02773409578270193, + "grad_norm": 0.613459050655365, + "learning_rate": 1.938e-05, + "loss": 0.0523, + "step": 970 + }, + { + "epoch": 0.028020014295925662, + "grad_norm": 1.402799367904663, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0794, + "step": 980 + }, + { + "epoch": 0.028305932809149392, + "grad_norm": 1.1603201627731323, + "learning_rate": 1.978e-05, + "loss": 0.0583, + "step": 990 + }, + { + "epoch": 0.028591851322373123, + "grad_norm": 0.8101517558097839, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0584, + "step": 1000 + }, + { + "epoch": 0.028877769835596853, + "grad_norm": 1.060592770576477, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.077, + "step": 1010 + }, + { + "epoch": 0.029163688348820587, + "grad_norm": 1.2096195220947266, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.066, + "step": 1020 + }, + { + "epoch": 0.029449606862044318, + "grad_norm": 1.0035862922668457, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0625, + "step": 1030 + }, + { + "epoch": 0.029735525375268048, + "grad_norm": 0.44185084104537964, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.03002144388849178, + "grad_norm": 1.209908127784729, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0693, + "step": 1050 + }, + { + "epoch": 0.030307362401715512, + "grad_norm": 0.9716938138008118, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0706, + "step": 1060 + }, + { + "epoch": 0.030593280914939243, + "grad_norm": 0.8310994505882263, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0622, + "step": 1070 + }, + { + "epoch": 0.030879199428162973, + "grad_norm": 0.8737888932228088, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0564, + "step": 1080 + }, + { + "epoch": 0.031165117941386704, + "grad_norm": 0.7609763145446777, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0552, + "step": 1090 + }, + { + "epoch": 0.031451036454610434, + "grad_norm": 0.6319764256477356, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0593, + "step": 1100 + }, + { + "epoch": 0.031736954967834165, + "grad_norm": 0.5562251806259155, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0553, + "step": 1110 + }, + { + "epoch": 0.032022873481057895, + "grad_norm": 1.3476046323776245, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0805, + "step": 1120 + }, + { + "epoch": 0.03230879199428163, + "grad_norm": 0.5449394583702087, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0666, + "step": 1130 + }, + { + "epoch": 0.03259471050750536, + "grad_norm": 0.8675817251205444, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0703, + "step": 1140 + }, + { + "epoch": 0.032880629020729094, + "grad_norm": 0.8713150024414062, + "learning_rate": 1.999882759038658e-05, + "loss": 0.063, + "step": 1150 + }, + { + "epoch": 0.033166547533952824, + "grad_norm": 0.7205761075019836, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0597, + "step": 1160 + }, + { + "epoch": 0.033452466047176554, + "grad_norm": 0.482741117477417, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0558, + "step": 1170 + }, + { + "epoch": 0.033738384560400285, + "grad_norm": 0.8652167320251465, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0559, + "step": 1180 + }, + { + "epoch": 0.034024303073624015, + "grad_norm": 0.5286755561828613, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0673, + "step": 1190 + }, + { + "epoch": 0.034310221586847746, + "grad_norm": 0.9883217215538025, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0609, + "step": 1200 + }, + { + "epoch": 0.034596140100071476, + "grad_norm": 0.7700253129005432, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0703, + "step": 1210 + }, + { + "epoch": 0.034882058613295214, + "grad_norm": 0.8669867515563965, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0577, + "step": 1220 + }, + { + "epoch": 0.035167977126518944, + "grad_norm": 0.8856104016304016, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0599, + "step": 1230 + }, + { + "epoch": 0.035453895639742675, + "grad_norm": 0.5517004728317261, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0791, + "step": 1240 + }, + { + "epoch": 0.035739814152966405, + "grad_norm": 0.7505853176116943, + "learning_rate": 1.999672592499692e-05, + "loss": 0.086, + "step": 1250 + }, + { + "epoch": 0.036025732666190136, + "grad_norm": 0.7412230968475342, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0534, + "step": 1260 + }, + { + "epoch": 0.036311651179413866, + "grad_norm": 0.6629419922828674, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0607, + "step": 1270 + }, + { + "epoch": 0.036597569692637597, + "grad_norm": 0.7081887125968933, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0513, + "step": 1280 + }, + { + "epoch": 0.03688348820586133, + "grad_norm": 0.8555129766464233, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0574, + "step": 1290 + }, + { + "epoch": 0.03716940671908506, + "grad_norm": 0.5992563366889954, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0529, + "step": 1300 + }, + { + "epoch": 0.037455325232308795, + "grad_norm": 0.8527185320854187, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0588, + "step": 1310 + }, + { + "epoch": 0.037741243745532525, + "grad_norm": 1.078600525856018, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0644, + "step": 1320 + }, + { + "epoch": 0.038027162258756256, + "grad_norm": 0.8158502578735352, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0587, + "step": 1330 + }, + { + "epoch": 0.038313080771979986, + "grad_norm": 1.011278748512268, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0696, + "step": 1340 + }, + { + "epoch": 0.03859899928520372, + "grad_norm": 0.806888222694397, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0588, + "step": 1350 + }, + { + "epoch": 0.03888491779842745, + "grad_norm": 0.7776031494140625, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0461, + "step": 1360 + }, + { + "epoch": 0.03917083631165118, + "grad_norm": 0.6119349598884583, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0566, + "step": 1370 + }, + { + "epoch": 0.03945675482487491, + "grad_norm": 0.6168059706687927, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0636, + "step": 1380 + }, + { + "epoch": 0.03974267333809864, + "grad_norm": 0.8180692195892334, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0596, + "step": 1390 + }, + { + "epoch": 0.040028591851322376, + "grad_norm": 0.6775726079940796, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0586, + "step": 1400 + }, + { + "epoch": 0.040314510364546106, + "grad_norm": 0.7446377873420715, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.057, + "step": 1410 + }, + { + "epoch": 0.04060042887776984, + "grad_norm": 0.9334514737129211, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0551, + "step": 1420 + }, + { + "epoch": 0.04088634739099357, + "grad_norm": 1.481874942779541, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0664, + "step": 1430 + }, + { + "epoch": 0.0411722659042173, + "grad_norm": 0.9553850889205933, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0645, + "step": 1440 + }, + { + "epoch": 0.04145818441744103, + "grad_norm": 0.8824119567871094, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0694, + "step": 1450 + }, + { + "epoch": 0.04174410293066476, + "grad_norm": 1.0382661819458008, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0683, + "step": 1460 + }, + { + "epoch": 0.04203002144388849, + "grad_norm": 0.5914127826690674, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0652, + "step": 1470 + }, + { + "epoch": 0.04231593995711222, + "grad_norm": 0.8497964143753052, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0706, + "step": 1480 + }, + { + "epoch": 0.04260185847033596, + "grad_norm": 0.897759199142456, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0647, + "step": 1490 + }, + { + "epoch": 0.04288777698355969, + "grad_norm": 1.1102443933486938, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0579, + "step": 1500 + }, + { + "epoch": 0.04317369549678342, + "grad_norm": 0.7638678550720215, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0606, + "step": 1510 + }, + { + "epoch": 0.04345961401000715, + "grad_norm": 0.6662708520889282, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.067, + "step": 1520 + }, + { + "epoch": 0.04374553252323088, + "grad_norm": 0.4957924485206604, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0685, + "step": 1530 + }, + { + "epoch": 0.04403145103645461, + "grad_norm": 0.6456794738769531, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0665, + "step": 1540 + }, + { + "epoch": 0.04431736954967834, + "grad_norm": 1.1598498821258545, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0527, + "step": 1550 + }, + { + "epoch": 0.04460328806290207, + "grad_norm": 0.931520938873291, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0682, + "step": 1560 + }, + { + "epoch": 0.0448892065761258, + "grad_norm": 0.7289925813674927, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0726, + "step": 1570 + }, + { + "epoch": 0.04517512508934954, + "grad_norm": 0.5471235513687134, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0561, + "step": 1580 + }, + { + "epoch": 0.04546104360257327, + "grad_norm": 0.8686550259590149, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0552, + "step": 1590 + }, + { + "epoch": 0.045746962115797, + "grad_norm": 1.1767120361328125, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0544, + "step": 1600 + }, + { + "epoch": 0.04603288062902073, + "grad_norm": 0.8729729056358337, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0625, + "step": 1610 + }, + { + "epoch": 0.04631879914224446, + "grad_norm": 1.3734601736068726, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0667, + "step": 1620 + }, + { + "epoch": 0.04660471765546819, + "grad_norm": 0.6810682415962219, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0522, + "step": 1630 + }, + { + "epoch": 0.04689063616869192, + "grad_norm": 0.7744873762130737, + "learning_rate": 1.997844517262844e-05, + "loss": 0.06, + "step": 1640 + }, + { + "epoch": 0.04717655468191565, + "grad_norm": 1.000954270362854, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0606, + "step": 1650 + }, + { + "epoch": 0.04746247319513938, + "grad_norm": 0.8105701208114624, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.04774839170836312, + "grad_norm": 0.9504240155220032, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0833, + "step": 1670 + }, + { + "epoch": 0.04803431022158685, + "grad_norm": 0.910836935043335, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0582, + "step": 1680 + }, + { + "epoch": 0.04832022873481058, + "grad_norm": 0.5865645408630371, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0657, + "step": 1690 + }, + { + "epoch": 0.04860614724803431, + "grad_norm": 1.0098698139190674, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 0.04889206576125804, + "grad_norm": 0.8097764253616333, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0563, + "step": 1710 + }, + { + "epoch": 0.04917798427448177, + "grad_norm": 0.9958128333091736, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0597, + "step": 1720 + }, + { + "epoch": 0.0494639027877055, + "grad_norm": 0.8471905589103699, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.04974982130092923, + "grad_norm": 0.647058367729187, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 0.05003573981415296, + "grad_norm": 1.0832161903381348, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.052, + "step": 1750 + }, + { + "epoch": 0.0503216583273767, + "grad_norm": 0.8469381332397461, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0615, + "step": 1760 + }, + { + "epoch": 0.05060757684060043, + "grad_norm": 0.5371052622795105, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0556, + "step": 1770 + }, + { + "epoch": 0.05089349535382416, + "grad_norm": 0.9016183614730835, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0561, + "step": 1780 + }, + { + "epoch": 0.05117941386704789, + "grad_norm": 0.8829526305198669, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0648, + "step": 1790 + }, + { + "epoch": 0.05146533238027162, + "grad_norm": 1.079738974571228, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0577, + "step": 1800 + }, + { + "epoch": 0.05175125089349535, + "grad_norm": 0.7496556639671326, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.052, + "step": 1810 + }, + { + "epoch": 0.05203716940671908, + "grad_norm": 0.7587016820907593, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0634, + "step": 1820 + }, + { + "epoch": 0.052323087919942814, + "grad_norm": 0.9622246623039246, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0543, + "step": 1830 + }, + { + "epoch": 0.052609006433166544, + "grad_norm": 0.6643623113632202, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0606, + "step": 1840 + }, + { + "epoch": 0.05289492494639028, + "grad_norm": 0.8060843348503113, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0562, + "step": 1850 + }, + { + "epoch": 0.05318084345961401, + "grad_norm": 0.7353034019470215, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0679, + "step": 1860 + }, + { + "epoch": 0.05346676197283774, + "grad_norm": 0.6636782288551331, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0561, + "step": 1870 + }, + { + "epoch": 0.05375268048606147, + "grad_norm": 0.6760010719299316, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0674, + "step": 1880 + }, + { + "epoch": 0.0540385989992852, + "grad_norm": 0.7144591808319092, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0551, + "step": 1890 + }, + { + "epoch": 0.054324517512508934, + "grad_norm": 0.8346575498580933, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.049, + "step": 1900 + }, + { + "epoch": 0.054610436025732664, + "grad_norm": 1.1682871580123901, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0554, + "step": 1910 + }, + { + "epoch": 0.054896354538956395, + "grad_norm": 0.9150840640068054, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0549, + "step": 1920 + }, + { + "epoch": 0.055182273052180125, + "grad_norm": 0.37064746022224426, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0547, + "step": 1930 + }, + { + "epoch": 0.05546819156540386, + "grad_norm": 1.1214783191680908, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0697, + "step": 1940 + }, + { + "epoch": 0.05575411007862759, + "grad_norm": 0.8259853720664978, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0614, + "step": 1950 + }, + { + "epoch": 0.056040028591851324, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0811, + "step": 1960 + }, + { + "epoch": 0.056325947105075054, + "grad_norm": 0.8764797449111938, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0605, + "step": 1970 + }, + { + "epoch": 0.056611865618298784, + "grad_norm": 0.770044207572937, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0481, + "step": 1980 + }, + { + "epoch": 0.056897784131522515, + "grad_norm": 1.333876132965088, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0729, + "step": 1990 + }, + { + "epoch": 0.057183702644746245, + "grad_norm": 0.5231258273124695, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.051, + "step": 2000 + }, + { + "epoch": 0.057469621157969976, + "grad_norm": 1.1937541961669922, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.061, + "step": 2010 + }, + { + "epoch": 0.057755539671193706, + "grad_norm": 0.7843487858772278, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0688, + "step": 2020 + }, + { + "epoch": 0.058041458184417444, + "grad_norm": 0.7956593632698059, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0878, + "step": 2030 + }, + { + "epoch": 0.058327376697641174, + "grad_norm": 0.5006444454193115, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0588, + "step": 2040 + }, + { + "epoch": 0.058613295210864905, + "grad_norm": 1.162245750427246, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0619, + "step": 2050 + }, + { + "epoch": 0.058899213724088635, + "grad_norm": 0.46943384408950806, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0584, + "step": 2060 + }, + { + "epoch": 0.059185132237312366, + "grad_norm": 0.3780323266983032, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0462, + "step": 2070 + }, + { + "epoch": 0.059471050750536096, + "grad_norm": 0.7066171765327454, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0652, + "step": 2080 + }, + { + "epoch": 0.05975696926375983, + "grad_norm": 0.8464685082435608, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0546, + "step": 2090 + }, + { + "epoch": 0.06004288777698356, + "grad_norm": 0.7198944687843323, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0534, + "step": 2100 + }, + { + "epoch": 0.06032880629020729, + "grad_norm": 0.7136557698249817, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0665, + "step": 2110 + }, + { + "epoch": 0.060614724803431025, + "grad_norm": 0.8739225268363953, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0542, + "step": 2120 + }, + { + "epoch": 0.060900643316654755, + "grad_norm": 0.6694063544273376, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0575, + "step": 2130 + }, + { + "epoch": 0.061186561829878486, + "grad_norm": 0.4805296063423157, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0578, + "step": 2140 + }, + { + "epoch": 0.061472480343102216, + "grad_norm": 0.758660078048706, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0455, + "step": 2150 + }, + { + "epoch": 0.06175839885632595, + "grad_norm": 0.8114968538284302, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0801, + "step": 2160 + }, + { + "epoch": 0.06204431736954968, + "grad_norm": 0.6585670113563538, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0564, + "step": 2170 + }, + { + "epoch": 0.06233023588277341, + "grad_norm": 1.2986794710159302, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0595, + "step": 2180 + }, + { + "epoch": 0.06261615439599715, + "grad_norm": 0.9822471141815186, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0515, + "step": 2190 + }, + { + "epoch": 0.06290207290922087, + "grad_norm": 0.8112025260925293, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0585, + "step": 2200 + }, + { + "epoch": 0.0631879914224446, + "grad_norm": 0.6239551305770874, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0641, + "step": 2210 + }, + { + "epoch": 0.06347390993566833, + "grad_norm": 0.8405657410621643, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.057, + "step": 2220 + }, + { + "epoch": 0.06375982844889207, + "grad_norm": 0.4925670623779297, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0568, + "step": 2230 + }, + { + "epoch": 0.06404574696211579, + "grad_norm": 0.8599978089332581, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0587, + "step": 2240 + }, + { + "epoch": 0.06433166547533953, + "grad_norm": 0.8657258749008179, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.06461758398856327, + "grad_norm": 0.5826218128204346, + "learning_rate": 1.991642153373178e-05, + "loss": 0.055, + "step": 2260 + }, + { + "epoch": 0.06490350250178699, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0533, + "step": 2270 + }, + { + "epoch": 0.06518942101501073, + "grad_norm": 0.8345134258270264, + "learning_rate": 1.991374933341515e-05, + "loss": 0.064, + "step": 2280 + }, + { + "epoch": 0.06547533952823445, + "grad_norm": 0.6610177755355835, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0553, + "step": 2290 + }, + { + "epoch": 0.06576125804145819, + "grad_norm": 0.8541404604911804, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0483, + "step": 2300 + }, + { + "epoch": 0.06604717655468191, + "grad_norm": 0.9029123187065125, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0517, + "step": 2310 + }, + { + "epoch": 0.06633309506790565, + "grad_norm": 0.614111602306366, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.06661901358112937, + "grad_norm": 0.8723806142807007, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0511, + "step": 2330 + }, + { + "epoch": 0.06690493209435311, + "grad_norm": 0.5288586020469666, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0474, + "step": 2340 + }, + { + "epoch": 0.06719085060757685, + "grad_norm": 0.6346511840820312, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0532, + "step": 2350 + }, + { + "epoch": 0.06747676912080057, + "grad_norm": 0.9112687706947327, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0583, + "step": 2360 + }, + { + "epoch": 0.06776268763402431, + "grad_norm": 0.6879385113716125, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0551, + "step": 2370 + }, + { + "epoch": 0.06804860614724803, + "grad_norm": 0.6945562958717346, + "learning_rate": 1.989976094288735e-05, + "loss": 0.053, + "step": 2380 + }, + { + "epoch": 0.06833452466047177, + "grad_norm": 0.6774301528930664, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0596, + "step": 2390 + }, + { + "epoch": 0.06862044317369549, + "grad_norm": 0.7311446070671082, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0576, + "step": 2400 + }, + { + "epoch": 0.06890636168691923, + "grad_norm": 0.9301936030387878, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0608, + "step": 2410 + }, + { + "epoch": 0.06919228020014295, + "grad_norm": 1.1750341653823853, + "learning_rate": 1.989387305123247e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.06947819871336669, + "grad_norm": 0.716266930103302, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.053, + "step": 2430 + }, + { + "epoch": 0.06976411722659043, + "grad_norm": 0.8549973964691162, + "learning_rate": 1.989086647373215e-05, + "loss": 0.061, + "step": 2440 + }, + { + "epoch": 0.07005003573981415, + "grad_norm": 0.7306638360023499, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "epoch": 0.07033595425303789, + "grad_norm": 1.2529624700546265, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0597, + "step": 2460 + }, + { + "epoch": 0.07062187276626161, + "grad_norm": 0.7199717164039612, + "learning_rate": 1.988627835751598e-05, + "loss": 0.047, + "step": 2470 + }, + { + "epoch": 0.07090779127948535, + "grad_norm": 0.8007253408432007, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0544, + "step": 2480 + }, + { + "epoch": 0.07119370979270907, + "grad_norm": 0.7852535843849182, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0507, + "step": 2490 + }, + { + "epoch": 0.07147962830593281, + "grad_norm": 1.0649739503860474, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.058, + "step": 2500 + }, + { + "epoch": 0.07176554681915653, + "grad_norm": 0.8080071806907654, + "learning_rate": 1.988001487826387e-05, + "loss": 0.059, + "step": 2510 + }, + { + "epoch": 0.07205146533238027, + "grad_norm": 0.49453601241111755, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0522, + "step": 2520 + }, + { + "epoch": 0.07233738384560401, + "grad_norm": 0.7618975639343262, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.07262330235882773, + "grad_norm": 0.6284596920013428, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.058, + "step": 2540 + }, + { + "epoch": 0.07290922087205147, + "grad_norm": 1.6536812782287598, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0535, + "step": 2550 + }, + { + "epoch": 0.07319513938527519, + "grad_norm": 0.6516987681388855, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.061, + "step": 2560 + }, + { + "epoch": 0.07348105789849893, + "grad_norm": 0.7660441398620605, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0603, + "step": 2570 + }, + { + "epoch": 0.07376697641172265, + "grad_norm": 0.7900884747505188, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0494, + "step": 2580 + }, + { + "epoch": 0.07405289492494639, + "grad_norm": 0.9578459858894348, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0492, + "step": 2590 + }, + { + "epoch": 0.07433881343817011, + "grad_norm": 0.5268751978874207, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0524, + "step": 2600 + }, + { + "epoch": 0.07462473195139385, + "grad_norm": 0.8935990929603577, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0537, + "step": 2610 + }, + { + "epoch": 0.07491065046461759, + "grad_norm": 0.940441370010376, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0545, + "step": 2620 + }, + { + "epoch": 0.07519656897784131, + "grad_norm": 0.42767468094825745, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0508, + "step": 2630 + }, + { + "epoch": 0.07548248749106505, + "grad_norm": 0.6892207860946655, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0417, + "step": 2640 + }, + { + "epoch": 0.07576840600428877, + "grad_norm": 1.2622859477996826, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0665, + "step": 2650 + }, + { + "epoch": 0.07605432451751251, + "grad_norm": 0.8809115290641785, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0617, + "step": 2660 + }, + { + "epoch": 0.07634024303073624, + "grad_norm": 0.604371190071106, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0577, + "step": 2670 + }, + { + "epoch": 0.07662616154395997, + "grad_norm": 0.7091525793075562, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0552, + "step": 2680 + }, + { + "epoch": 0.0769120800571837, + "grad_norm": 0.7841326594352722, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0579, + "step": 2690 + }, + { + "epoch": 0.07719799857040743, + "grad_norm": 0.7789046764373779, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0511, + "step": 2700 + }, + { + "epoch": 0.07748391708363117, + "grad_norm": 0.6497660875320435, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0532, + "step": 2710 + }, + { + "epoch": 0.0777698355968549, + "grad_norm": 0.6902356147766113, + "learning_rate": 1.984439891859038e-05, + "loss": 0.06, + "step": 2720 + }, + { + "epoch": 0.07805575411007863, + "grad_norm": 0.5721703767776489, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0442, + "step": 2730 + }, + { + "epoch": 0.07834167262330236, + "grad_norm": 0.5205336809158325, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0551, + "step": 2740 + }, + { + "epoch": 0.07862759113652609, + "grad_norm": 1.0646073818206787, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0546, + "step": 2750 + }, + { + "epoch": 0.07891350964974982, + "grad_norm": 0.6809906363487244, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0603, + "step": 2760 + }, + { + "epoch": 0.07919942816297355, + "grad_norm": 0.7592756152153015, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0611, + "step": 2770 + }, + { + "epoch": 0.07948534667619728, + "grad_norm": 0.970733106136322, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.066, + "step": 2780 + }, + { + "epoch": 0.07977126518942101, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.983150881656814e-05, + "loss": 0.049, + "step": 2790 + }, + { + "epoch": 0.08005718370264475, + "grad_norm": 0.6761397123336792, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.048, + "step": 2800 + }, + { + "epoch": 0.08034310221586848, + "grad_norm": 0.9752228856086731, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.08062902072909221, + "grad_norm": 0.8727124929428101, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0629, + "step": 2820 + }, + { + "epoch": 0.08091493924231594, + "grad_norm": 0.8425240516662598, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0517, + "step": 2830 + }, + { + "epoch": 0.08120085775553967, + "grad_norm": 0.7011470198631287, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0484, + "step": 2840 + }, + { + "epoch": 0.0814867762687634, + "grad_norm": 0.836200475692749, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0504, + "step": 2850 + }, + { + "epoch": 0.08177269478198713, + "grad_norm": 0.4431964159011841, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0527, + "step": 2860 + }, + { + "epoch": 0.08205861329521086, + "grad_norm": 0.4666791260242462, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0556, + "step": 2870 + }, + { + "epoch": 0.0823445318084346, + "grad_norm": 0.5705346465110779, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0544, + "step": 2880 + }, + { + "epoch": 0.08263045032165833, + "grad_norm": 1.7237486839294434, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0555, + "step": 2890 + }, + { + "epoch": 0.08291636883488206, + "grad_norm": 0.9305147528648376, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.055, + "step": 2900 + }, + { + "epoch": 0.0832022873481058, + "grad_norm": 1.3475992679595947, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0723, + "step": 2910 + }, + { + "epoch": 0.08348820586132952, + "grad_norm": 0.7196787595748901, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0581, + "step": 2920 + }, + { + "epoch": 0.08377412437455325, + "grad_norm": 0.4567016363143921, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.08406004288777698, + "grad_norm": 0.8537796139717102, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0589, + "step": 2940 + }, + { + "epoch": 0.08434596140100072, + "grad_norm": 0.9526864886283875, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0581, + "step": 2950 + }, + { + "epoch": 0.08463187991422444, + "grad_norm": 0.8753517866134644, + "learning_rate": 1.979809151602651e-05, + "loss": 0.066, + "step": 2960 + }, + { + "epoch": 0.08491779842744818, + "grad_norm": 0.9062561988830566, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0472, + "step": 2970 + }, + { + "epoch": 0.08520371694067191, + "grad_norm": 1.0018329620361328, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0584, + "step": 2980 + }, + { + "epoch": 0.08548963545389564, + "grad_norm": 1.0577157735824585, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.05, + "step": 2990 + }, + { + "epoch": 0.08577555396711938, + "grad_norm": 1.0216799974441528, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0703, + "step": 3000 + }, + { + "epoch": 0.0860614724803431, + "grad_norm": 0.5581191778182983, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0682, + "step": 3010 + }, + { + "epoch": 0.08634739099356684, + "grad_norm": 0.6187682151794434, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 0.08663330950679056, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0565, + "step": 3030 + }, + { + "epoch": 0.0869192280200143, + "grad_norm": 0.8952509760856628, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0615, + "step": 3040 + }, + { + "epoch": 0.08720514653323802, + "grad_norm": 0.7387855648994446, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0434, + "step": 3050 + }, + { + "epoch": 0.08749106504646176, + "grad_norm": 0.8661363124847412, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0453, + "step": 3060 + }, + { + "epoch": 0.0877769835596855, + "grad_norm": 1.552089810371399, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0577, + "step": 3070 + }, + { + "epoch": 0.08806290207290922, + "grad_norm": 0.7555598616600037, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.058, + "step": 3080 + }, + { + "epoch": 0.08834882058613296, + "grad_norm": 0.7763100266456604, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.074, + "step": 3090 + }, + { + "epoch": 0.08863473909935668, + "grad_norm": 0.5088932514190674, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.07, + "step": 3100 + }, + { + "epoch": 0.08892065761258042, + "grad_norm": 0.517383873462677, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.08920657612580414, + "grad_norm": 0.9673930406570435, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.061, + "step": 3120 + }, + { + "epoch": 0.08949249463902788, + "grad_norm": 1.1182832717895508, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0503, + "step": 3130 + }, + { + "epoch": 0.0897784131522516, + "grad_norm": 0.8064592480659485, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0527, + "step": 3140 + }, + { + "epoch": 0.09006433166547534, + "grad_norm": 1.3616310358047485, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0491, + "step": 3150 + }, + { + "epoch": 0.09035025017869908, + "grad_norm": 0.6205968856811523, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0492, + "step": 3160 + }, + { + "epoch": 0.0906361686919228, + "grad_norm": 0.9427729249000549, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.054, + "step": 3170 + }, + { + "epoch": 0.09092208720514654, + "grad_norm": 0.6940050721168518, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0622, + "step": 3180 + }, + { + "epoch": 0.09120800571837026, + "grad_norm": 0.7082361578941345, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0474, + "step": 3190 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.4606474041938782, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 0.09177984274481772, + "grad_norm": 0.46445760130882263, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0483, + "step": 3210 + }, + { + "epoch": 0.09206576125804146, + "grad_norm": 0.7431371212005615, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.046, + "step": 3220 + }, + { + "epoch": 0.09235167977126518, + "grad_norm": 0.8430010676383972, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0665, + "step": 3230 + }, + { + "epoch": 0.09263759828448892, + "grad_norm": 0.9888875484466553, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0676, + "step": 3240 + }, + { + "epoch": 0.09292351679771266, + "grad_norm": 0.792150616645813, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0636, + "step": 3250 + }, + { + "epoch": 0.09320943531093638, + "grad_norm": 0.859030544757843, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0634, + "step": 3260 + }, + { + "epoch": 0.09349535382416012, + "grad_norm": 0.7612795233726501, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0658, + "step": 3270 + }, + { + "epoch": 0.09378127233738384, + "grad_norm": 0.5470104217529297, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0514, + "step": 3280 + }, + { + "epoch": 0.09406719085060758, + "grad_norm": 0.6354894042015076, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0489, + "step": 3290 + }, + { + "epoch": 0.0943531093638313, + "grad_norm": 1.3852356672286987, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 0.09463902787705504, + "grad_norm": 0.5610274076461792, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.09492494639027876, + "grad_norm": 1.2192410230636597, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0525, + "step": 3320 + }, + { + "epoch": 0.0952108649035025, + "grad_norm": 1.06831955909729, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.051, + "step": 3330 + }, + { + "epoch": 0.09549678341672624, + "grad_norm": 0.32288479804992676, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0503, + "step": 3340 + }, + { + "epoch": 0.09578270192994996, + "grad_norm": 0.5871645212173462, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0564, + "step": 3350 + }, + { + "epoch": 0.0960686204431737, + "grad_norm": 0.6069591045379639, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0495, + "step": 3360 + }, + { + "epoch": 0.09635453895639742, + "grad_norm": 1.0015379190444946, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0624, + "step": 3370 + }, + { + "epoch": 0.09664045746962116, + "grad_norm": 0.7534980773925781, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0618, + "step": 3380 + }, + { + "epoch": 0.09692637598284488, + "grad_norm": 0.45888280868530273, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0445, + "step": 3390 + }, + { + "epoch": 0.09721229449606862, + "grad_norm": 0.7550806403160095, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0461, + "step": 3400 + }, + { + "epoch": 0.09749821300929234, + "grad_norm": 0.4738181531429291, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 0.09778413152251608, + "grad_norm": 0.6711190938949585, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0476, + "step": 3420 + }, + { + "epoch": 0.09807005003573982, + "grad_norm": 0.4751316010951996, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0507, + "step": 3430 + }, + { + "epoch": 0.09835596854896354, + "grad_norm": 0.83565753698349, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "epoch": 0.09864188706218728, + "grad_norm": 0.5360665321350098, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0627, + "step": 3450 + }, + { + "epoch": 0.098927805575411, + "grad_norm": 0.7463604211807251, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0496, + "step": 3460 + }, + { + "epoch": 0.09921372408863474, + "grad_norm": 0.7294344305992126, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0697, + "step": 3470 + }, + { + "epoch": 0.09949964260185847, + "grad_norm": 0.5676283836364746, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0541, + "step": 3480 + }, + { + "epoch": 0.0997855611150822, + "grad_norm": 0.5879732370376587, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 0.10007147962830593, + "grad_norm": 0.832818865776062, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0505, + "step": 3500 + }, + { + "epoch": 0.10035739814152966, + "grad_norm": 0.48553410172462463, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0573, + "step": 3510 + }, + { + "epoch": 0.1006433166547534, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0449, + "step": 3520 + }, + { + "epoch": 0.10092923516797712, + "grad_norm": 0.7497885227203369, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0737, + "step": 3530 + }, + { + "epoch": 0.10121515368120086, + "grad_norm": 0.5581928491592407, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0514, + "step": 3540 + }, + { + "epoch": 0.10150107219442459, + "grad_norm": 1.140236258506775, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0524, + "step": 3550 + }, + { + "epoch": 0.10178699070764832, + "grad_norm": 0.8161870241165161, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0721, + "step": 3560 + }, + { + "epoch": 0.10207290922087205, + "grad_norm": 0.8796533942222595, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0564, + "step": 3570 + }, + { + "epoch": 0.10235882773409578, + "grad_norm": 1.4811128377914429, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.10264474624731951, + "grad_norm": 0.8029062747955322, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0589, + "step": 3590 + }, + { + "epoch": 0.10293066476054324, + "grad_norm": 0.7806634902954102, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0617, + "step": 3600 + }, + { + "epoch": 0.10321658327376698, + "grad_norm": 1.1286838054656982, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0574, + "step": 3610 + }, + { + "epoch": 0.1035025017869907, + "grad_norm": 0.374104768037796, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.047, + "step": 3620 + }, + { + "epoch": 0.10378842030021444, + "grad_norm": 1.1743136644363403, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0676, + "step": 3630 + }, + { + "epoch": 0.10407433881343817, + "grad_norm": 0.7684413194656372, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0663, + "step": 3640 + }, + { + "epoch": 0.1043602573266619, + "grad_norm": 1.0642409324645996, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.051, + "step": 3650 + }, + { + "epoch": 0.10464617583988563, + "grad_norm": 0.7752460837364197, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0543, + "step": 3660 + }, + { + "epoch": 0.10493209435310936, + "grad_norm": 0.9053257703781128, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.051, + "step": 3670 + }, + { + "epoch": 0.10521801286633309, + "grad_norm": 0.7407983541488647, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 0.10550393137955683, + "grad_norm": 1.3622519969940186, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0468, + "step": 3690 + }, + { + "epoch": 0.10578984989278056, + "grad_norm": 1.2751463651657104, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0571, + "step": 3700 + }, + { + "epoch": 0.10607576840600429, + "grad_norm": 0.5535411238670349, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0564, + "step": 3710 + }, + { + "epoch": 0.10636168691922802, + "grad_norm": 0.6728671193122864, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0487, + "step": 3720 + }, + { + "epoch": 0.10664760543245175, + "grad_norm": 0.82345050573349, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0656, + "step": 3730 + }, + { + "epoch": 0.10693352394567548, + "grad_norm": 0.6446594595909119, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0467, + "step": 3740 + }, + { + "epoch": 0.10721944245889921, + "grad_norm": 1.0836280584335327, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0536, + "step": 3750 + }, + { + "epoch": 0.10750536097212295, + "grad_norm": 0.3758300840854645, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0476, + "step": 3760 + }, + { + "epoch": 0.10779127948534667, + "grad_norm": 0.682266116142273, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0519, + "step": 3770 + }, + { + "epoch": 0.1080771979985704, + "grad_norm": 0.5025804042816162, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0474, + "step": 3780 + }, + { + "epoch": 0.10836311651179414, + "grad_norm": 1.019890308380127, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0492, + "step": 3790 + }, + { + "epoch": 0.10864903502501787, + "grad_norm": 0.7843710780143738, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0561, + "step": 3800 + }, + { + "epoch": 0.1089349535382416, + "grad_norm": 0.5028522610664368, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0582, + "step": 3810 + }, + { + "epoch": 0.10922087205146533, + "grad_norm": 0.6400144696235657, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0502, + "step": 3820 + }, + { + "epoch": 0.10950679056468907, + "grad_norm": 0.9432899355888367, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0471, + "step": 3830 + }, + { + "epoch": 0.10979270907791279, + "grad_norm": 0.7582482695579529, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.052, + "step": 3840 + }, + { + "epoch": 0.11007862759113653, + "grad_norm": 0.34035608172416687, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0464, + "step": 3850 + }, + { + "epoch": 0.11036454610436025, + "grad_norm": 1.3330878019332886, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.11065046461758399, + "grad_norm": 0.7309219837188721, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.052, + "step": 3870 + }, + { + "epoch": 0.11093638313080773, + "grad_norm": 0.6248922944068909, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0815, + "step": 3880 + }, + { + "epoch": 0.11122230164403145, + "grad_norm": 0.8298835158348083, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0491, + "step": 3890 + }, + { + "epoch": 0.11150822015725519, + "grad_norm": 0.6728928685188293, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0506, + "step": 3900 + }, + { + "epoch": 0.11179413867047891, + "grad_norm": 0.8456764817237854, + "learning_rate": 1.95567930185928e-05, + "loss": 0.051, + "step": 3910 + }, + { + "epoch": 0.11208005718370265, + "grad_norm": 0.9024212956428528, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0514, + "step": 3920 + }, + { + "epoch": 0.11236597569692637, + "grad_norm": 0.4843275845050812, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.056, + "step": 3930 + }, + { + "epoch": 0.11265189421015011, + "grad_norm": 0.5677530765533447, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0548, + "step": 3940 + }, + { + "epoch": 0.11293781272337383, + "grad_norm": 1.0913296937942505, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0697, + "step": 3950 + }, + { + "epoch": 0.11322373123659757, + "grad_norm": 0.6271129250526428, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0655, + "step": 3960 + }, + { + "epoch": 0.1135096497498213, + "grad_norm": 0.9063813090324402, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0469, + "step": 3970 + }, + { + "epoch": 0.11379556826304503, + "grad_norm": 0.7493836283683777, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0482, + "step": 3980 + }, + { + "epoch": 0.11408148677626877, + "grad_norm": 0.8022870421409607, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0655, + "step": 3990 + }, + { + "epoch": 0.11436740528949249, + "grad_norm": 0.6266750693321228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0542, + "step": 4000 + }, + { + "epoch": 0.11465332380271623, + "grad_norm": 0.45027732849121094, + "learning_rate": 1.95260726824789e-05, + "loss": 0.058, + "step": 4010 + }, + { + "epoch": 0.11493924231593995, + "grad_norm": 0.950760543346405, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0552, + "step": 4020 + }, + { + "epoch": 0.11522516082916369, + "grad_norm": 0.6397078037261963, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0544, + "step": 4030 + }, + { + "epoch": 0.11551107934238741, + "grad_norm": 0.7060579657554626, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0565, + "step": 4040 + }, + { + "epoch": 0.11579699785561115, + "grad_norm": 0.7861781716346741, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0511, + "step": 4050 + }, + { + "epoch": 0.11608291636883489, + "grad_norm": 0.5479229688644409, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0509, + "step": 4060 + }, + { + "epoch": 0.11636883488205861, + "grad_norm": 0.3854960501194, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0623, + "step": 4070 + }, + { + "epoch": 0.11665475339528235, + "grad_norm": 1.9533435106277466, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0643, + "step": 4080 + }, + { + "epoch": 0.11694067190850607, + "grad_norm": 0.5853668451309204, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0453, + "step": 4090 + }, + { + "epoch": 0.11722659042172981, + "grad_norm": 0.6850668787956238, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0555, + "step": 4100 + }, + { + "epoch": 0.11751250893495353, + "grad_norm": 1.1605839729309082, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0586, + "step": 4110 + }, + { + "epoch": 0.11779842744817727, + "grad_norm": 0.7753151059150696, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0623, + "step": 4120 + }, + { + "epoch": 0.118084345961401, + "grad_norm": 0.7955726385116577, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0467, + "step": 4130 + }, + { + "epoch": 0.11837026447462473, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0523, + "step": 4140 + }, + { + "epoch": 0.11865618298784847, + "grad_norm": 0.5821241140365601, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 0.11894210150107219, + "grad_norm": 0.4795539379119873, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0559, + "step": 4160 + }, + { + "epoch": 0.11922802001429593, + "grad_norm": 0.6324377655982971, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0473, + "step": 4170 + }, + { + "epoch": 0.11951393852751965, + "grad_norm": 0.8578745722770691, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0474, + "step": 4180 + }, + { + "epoch": 0.11979985704074339, + "grad_norm": 0.5988736748695374, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 0.12008577555396711, + "grad_norm": 0.8098701238632202, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0511, + "step": 4200 + }, + { + "epoch": 0.12037169406719085, + "grad_norm": 1.2059956789016724, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0501, + "step": 4210 + }, + { + "epoch": 0.12065761258041457, + "grad_norm": 0.7477571368217468, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0565, + "step": 4220 + }, + { + "epoch": 0.12094353109363831, + "grad_norm": 0.467942476272583, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0601, + "step": 4230 + }, + { + "epoch": 0.12122944960686205, + "grad_norm": 0.5761682391166687, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.052, + "step": 4240 + }, + { + "epoch": 0.12151536812008577, + "grad_norm": 0.8247032761573792, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0503, + "step": 4250 + }, + { + "epoch": 0.12180128663330951, + "grad_norm": 0.5218040347099304, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0434, + "step": 4260 + }, + { + "epoch": 0.12208720514653323, + "grad_norm": 0.5024936199188232, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 0.12237312365975697, + "grad_norm": 0.5558021664619446, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0493, + "step": 4280 + }, + { + "epoch": 0.1226590421729807, + "grad_norm": 0.6252139210700989, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0467, + "step": 4290 + }, + { + "epoch": 0.12294496068620443, + "grad_norm": 0.6613588929176331, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0572, + "step": 4300 + }, + { + "epoch": 0.12323087919942816, + "grad_norm": 0.8098927736282349, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0551, + "step": 4310 + }, + { + "epoch": 0.1235167977126519, + "grad_norm": 0.8598331809043884, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0517, + "step": 4320 + }, + { + "epoch": 0.12380271622587563, + "grad_norm": 1.2555822134017944, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0499, + "step": 4330 + }, + { + "epoch": 0.12408863473909935, + "grad_norm": 0.5311633348464966, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0467, + "step": 4340 + }, + { + "epoch": 0.12437455325232309, + "grad_norm": 0.5674521327018738, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0564, + "step": 4350 + }, + { + "epoch": 0.12466047176554682, + "grad_norm": 0.5226582884788513, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0503, + "step": 4360 + }, + { + "epoch": 0.12494639027877055, + "grad_norm": 0.8510275483131409, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0626, + "step": 4370 + }, + { + "epoch": 0.1252323087919943, + "grad_norm": 1.6184005737304688, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0565, + "step": 4380 + }, + { + "epoch": 0.125518227305218, + "grad_norm": 0.7836401462554932, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0567, + "step": 4390 + }, + { + "epoch": 0.12580414581844174, + "grad_norm": 0.686989963054657, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0727, + "step": 4400 + }, + { + "epoch": 0.12609006433166547, + "grad_norm": 0.6000984907150269, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0459, + "step": 4410 + }, + { + "epoch": 0.1263759828448892, + "grad_norm": 0.8751336932182312, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0509, + "step": 4420 + }, + { + "epoch": 0.12666190135811295, + "grad_norm": 0.9281551837921143, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0536, + "step": 4430 + }, + { + "epoch": 0.12694781987133666, + "grad_norm": 0.5268979668617249, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 0.1272337383845604, + "grad_norm": 0.9246962070465088, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0525, + "step": 4450 + }, + { + "epoch": 0.12751965689778413, + "grad_norm": 1.2159569263458252, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0559, + "step": 4460 + }, + { + "epoch": 0.12780557541100787, + "grad_norm": 1.1705470085144043, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0473, + "step": 4470 + }, + { + "epoch": 0.12809149392423158, + "grad_norm": 0.4624033570289612, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0385, + "step": 4480 + }, + { + "epoch": 0.12837741243745532, + "grad_norm": 0.68497633934021, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.055, + "step": 4490 + }, + { + "epoch": 0.12866333095067906, + "grad_norm": 0.6132450699806213, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0512, + "step": 4500 + }, + { + "epoch": 0.1289492494639028, + "grad_norm": 0.7438398003578186, + "learning_rate": 1.935753861926916e-05, + "loss": 0.057, + "step": 4510 + }, + { + "epoch": 0.12923516797712653, + "grad_norm": 1.01064133644104, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0542, + "step": 4520 + }, + { + "epoch": 0.12952108649035024, + "grad_norm": 0.7620115280151367, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0511, + "step": 4530 + }, + { + "epoch": 0.12980700500357398, + "grad_norm": 0.8325042128562927, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0434, + "step": 4540 + }, + { + "epoch": 0.13009292351679771, + "grad_norm": 1.333525538444519, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0527, + "step": 4550 + }, + { + "epoch": 0.13037884203002145, + "grad_norm": 0.5498093962669373, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0455, + "step": 4560 + }, + { + "epoch": 0.13066476054324516, + "grad_norm": 0.8072503209114075, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0471, + "step": 4570 + }, + { + "epoch": 0.1309506790564689, + "grad_norm": 0.7596970200538635, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0476, + "step": 4580 + }, + { + "epoch": 0.13123659756969264, + "grad_norm": 0.5895066857337952, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.058, + "step": 4590 + }, + { + "epoch": 0.13152251608291637, + "grad_norm": 0.7977209687232971, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0453, + "step": 4600 + }, + { + "epoch": 0.1318084345961401, + "grad_norm": 0.6070771813392639, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0626, + "step": 4610 + }, + { + "epoch": 0.13209435310936382, + "grad_norm": 0.776318371295929, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0502, + "step": 4620 + }, + { + "epoch": 0.13238027162258756, + "grad_norm": 0.7913787961006165, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0495, + "step": 4630 + }, + { + "epoch": 0.1326661901358113, + "grad_norm": 0.7327920794487, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0537, + "step": 4640 + }, + { + "epoch": 0.13295210864903503, + "grad_norm": 1.2004122734069824, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0479, + "step": 4650 + }, + { + "epoch": 0.13323802716225874, + "grad_norm": 0.663301408290863, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0426, + "step": 4660 + }, + { + "epoch": 0.13352394567548248, + "grad_norm": 0.7744486331939697, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0538, + "step": 4670 + }, + { + "epoch": 0.13380986418870622, + "grad_norm": 0.6179795265197754, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.13409578270192996, + "grad_norm": 0.6461634635925293, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0458, + "step": 4690 + }, + { + "epoch": 0.1343817012151537, + "grad_norm": 0.6578474640846252, + "learning_rate": 1.928703895604588e-05, + "loss": 0.064, + "step": 4700 + }, + { + "epoch": 0.1346676197283774, + "grad_norm": 0.8851020336151123, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0632, + "step": 4710 + }, + { + "epoch": 0.13495353824160114, + "grad_norm": 0.4704781472682953, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0431, + "step": 4720 + }, + { + "epoch": 0.13523945675482488, + "grad_norm": 0.9809741377830505, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.059, + "step": 4730 + }, + { + "epoch": 0.13552537526804861, + "grad_norm": 0.9307458400726318, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0528, + "step": 4740 + }, + { + "epoch": 0.13581129378127232, + "grad_norm": 0.8084405660629272, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0603, + "step": 4750 + }, + { + "epoch": 0.13609721229449606, + "grad_norm": 0.6919799447059631, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0589, + "step": 4760 + }, + { + "epoch": 0.1363831308077198, + "grad_norm": 0.8543849587440491, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0508, + "step": 4770 + }, + { + "epoch": 0.13666904932094354, + "grad_norm": 0.6308473348617554, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0485, + "step": 4780 + }, + { + "epoch": 0.13695496783416727, + "grad_norm": 0.739931046962738, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0534, + "step": 4790 + }, + { + "epoch": 0.13724088634739098, + "grad_norm": 0.7895604372024536, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0449, + "step": 4800 + }, + { + "epoch": 0.13752680486061472, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0496, + "step": 4810 + }, + { + "epoch": 0.13781272337383846, + "grad_norm": 0.5999978184700012, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.1380986418870622, + "grad_norm": 0.8037213087081909, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0667, + "step": 4830 + }, + { + "epoch": 0.1383845604002859, + "grad_norm": 0.7414689064025879, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0509, + "step": 4840 + }, + { + "epoch": 0.13867047891350964, + "grad_norm": 0.6627739667892456, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0515, + "step": 4850 + }, + { + "epoch": 0.13895639742673338, + "grad_norm": 0.6969587802886963, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0626, + "step": 4860 + }, + { + "epoch": 0.13924231593995712, + "grad_norm": 0.7554855942726135, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0478, + "step": 4870 + }, + { + "epoch": 0.13952823445318085, + "grad_norm": 0.5623564124107361, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.044, + "step": 4880 + }, + { + "epoch": 0.13981415296640456, + "grad_norm": 0.6897832751274109, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0466, + "step": 4890 + }, + { + "epoch": 0.1401000714796283, + "grad_norm": 0.5474520921707153, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0514, + "step": 4900 + }, + { + "epoch": 0.14038598999285204, + "grad_norm": 0.9736361503601074, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0472, + "step": 4910 + }, + { + "epoch": 0.14067190850607578, + "grad_norm": 0.5566041469573975, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0533, + "step": 4920 + }, + { + "epoch": 0.1409578270192995, + "grad_norm": 1.0295166969299316, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0478, + "step": 4930 + }, + { + "epoch": 0.14124374553252322, + "grad_norm": 1.0931389331817627, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0652, + "step": 4940 + }, + { + "epoch": 0.14152966404574696, + "grad_norm": 1.3054399490356445, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0564, + "step": 4950 + }, + { + "epoch": 0.1418155825589707, + "grad_norm": 0.45592883229255676, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0456, + "step": 4960 + }, + { + "epoch": 0.14210150107219444, + "grad_norm": 0.6758268475532532, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0543, + "step": 4970 + }, + { + "epoch": 0.14238741958541815, + "grad_norm": 0.9643615484237671, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 0.14267333809864188, + "grad_norm": 0.565969705581665, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0458, + "step": 4990 + }, + { + "epoch": 0.14295925661186562, + "grad_norm": 0.8053064346313477, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0558, + "step": 5000 + }, + { + "epoch": 0.14324517512508936, + "grad_norm": 0.606215238571167, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0562, + "step": 5010 + }, + { + "epoch": 0.14353109363831307, + "grad_norm": 0.5565656423568726, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0543, + "step": 5020 + }, + { + "epoch": 0.1438170121515368, + "grad_norm": 0.353696346282959, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0451, + "step": 5030 + }, + { + "epoch": 0.14410293066476054, + "grad_norm": 0.6627641916275024, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0607, + "step": 5040 + }, + { + "epoch": 0.14438884917798428, + "grad_norm": 0.7896742224693298, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0496, + "step": 5050 + }, + { + "epoch": 0.14467476769120802, + "grad_norm": 0.7444631457328796, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0641, + "step": 5060 + }, + { + "epoch": 0.14496068620443173, + "grad_norm": 0.7871376872062683, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 0.14524660471765546, + "grad_norm": 0.7784642577171326, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0466, + "step": 5080 + }, + { + "epoch": 0.1455325232308792, + "grad_norm": 0.6950685381889343, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0457, + "step": 5090 + }, + { + "epoch": 0.14581844174410294, + "grad_norm": 1.0631619691848755, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0513, + "step": 5100 + }, + { + "epoch": 0.14610436025732665, + "grad_norm": 0.4327051639556885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0599, + "step": 5110 + }, + { + "epoch": 0.14639027877055039, + "grad_norm": 0.7790032029151917, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0617, + "step": 5120 + }, + { + "epoch": 0.14667619728377412, + "grad_norm": 0.42061591148376465, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.14696211579699786, + "grad_norm": 1.4090712070465088, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0564, + "step": 5140 + }, + { + "epoch": 0.1472480343102216, + "grad_norm": 0.540844738483429, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0505, + "step": 5150 + }, + { + "epoch": 0.1475339528234453, + "grad_norm": 0.5608566999435425, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0583, + "step": 5160 + }, + { + "epoch": 0.14781987133666905, + "grad_norm": 0.750708818435669, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0467, + "step": 5170 + }, + { + "epoch": 0.14810578984989278, + "grad_norm": 0.608989953994751, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0507, + "step": 5180 + }, + { + "epoch": 0.14839170836311652, + "grad_norm": 0.8176707029342651, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0455, + "step": 5190 + }, + { + "epoch": 0.14867762687634023, + "grad_norm": 0.5280511379241943, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0586, + "step": 5200 + }, + { + "epoch": 0.14896354538956397, + "grad_norm": 0.5914652347564697, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.042, + "step": 5210 + }, + { + "epoch": 0.1492494639027877, + "grad_norm": 0.4816238582134247, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0431, + "step": 5220 + }, + { + "epoch": 0.14953538241601144, + "grad_norm": 0.5413132309913635, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0453, + "step": 5230 + }, + { + "epoch": 0.14982130092923518, + "grad_norm": 0.749200701713562, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0505, + "step": 5240 + }, + { + "epoch": 0.1501072194424589, + "grad_norm": 0.8051598072052002, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.15039313795568263, + "grad_norm": 0.5365609526634216, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0383, + "step": 5260 + }, + { + "epoch": 0.15067905646890636, + "grad_norm": 0.5546812415122986, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0438, + "step": 5270 + }, + { + "epoch": 0.1509649749821301, + "grad_norm": 0.6248345375061035, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.045, + "step": 5280 + }, + { + "epoch": 0.1512508934953538, + "grad_norm": 0.42673179507255554, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0435, + "step": 5290 + }, + { + "epoch": 0.15153681200857755, + "grad_norm": 0.6677115559577942, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 0.15182273052180129, + "grad_norm": 0.4739227294921875, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0516, + "step": 5310 + }, + { + "epoch": 0.15210864903502502, + "grad_norm": 0.7931821346282959, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0566, + "step": 5320 + }, + { + "epoch": 0.15239456754824876, + "grad_norm": 0.6296460032463074, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0496, + "step": 5330 + }, + { + "epoch": 0.15268048606147247, + "grad_norm": 0.6713911890983582, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0462, + "step": 5340 + }, + { + "epoch": 0.1529664045746962, + "grad_norm": 1.088040828704834, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0663, + "step": 5350 + }, + { + "epoch": 0.15325232308791994, + "grad_norm": 1.4942265748977661, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0541, + "step": 5360 + }, + { + "epoch": 0.15353824160114368, + "grad_norm": 1.5721286535263062, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0546, + "step": 5370 + }, + { + "epoch": 0.1538241601143674, + "grad_norm": 0.9329798221588135, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0538, + "step": 5380 + }, + { + "epoch": 0.15411007862759113, + "grad_norm": 0.5658103823661804, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0561, + "step": 5390 + }, + { + "epoch": 0.15439599714081487, + "grad_norm": 0.6210218071937561, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.054, + "step": 5400 + }, + { + "epoch": 0.1546819156540386, + "grad_norm": 0.7934702634811401, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0506, + "step": 5410 + }, + { + "epoch": 0.15496783416726234, + "grad_norm": 1.0321810245513916, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0483, + "step": 5420 + }, + { + "epoch": 0.15525375268048605, + "grad_norm": 0.6226248145103455, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0464, + "step": 5430 + }, + { + "epoch": 0.1555396711937098, + "grad_norm": 0.6217877864837646, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0466, + "step": 5440 + }, + { + "epoch": 0.15582558970693353, + "grad_norm": 0.44068101048469543, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0517, + "step": 5450 + }, + { + "epoch": 0.15611150822015726, + "grad_norm": 0.4715922772884369, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0391, + "step": 5460 + }, + { + "epoch": 0.15639742673338097, + "grad_norm": 0.6649858951568604, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0524, + "step": 5470 + }, + { + "epoch": 0.1566833452466047, + "grad_norm": 0.5635918974876404, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.054, + "step": 5480 + }, + { + "epoch": 0.15696926375982845, + "grad_norm": 0.5584990978240967, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0559, + "step": 5490 + }, + { + "epoch": 0.15725518227305219, + "grad_norm": 0.7777124047279358, + "learning_rate": 1.895206504082557e-05, + "loss": 0.052, + "step": 5500 + }, + { + "epoch": 0.15754110078627592, + "grad_norm": 0.7057285308837891, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0507, + "step": 5510 + }, + { + "epoch": 0.15782701929949963, + "grad_norm": 0.4290146827697754, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0508, + "step": 5520 + }, + { + "epoch": 0.15811293781272337, + "grad_norm": 0.7333746552467346, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0609, + "step": 5530 + }, + { + "epoch": 0.1583988563259471, + "grad_norm": 0.6905514001846313, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0441, + "step": 5540 + }, + { + "epoch": 0.15868477483917084, + "grad_norm": 0.4859441816806793, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0586, + "step": 5550 + }, + { + "epoch": 0.15897069335239455, + "grad_norm": 0.4259501099586487, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0446, + "step": 5560 + }, + { + "epoch": 0.1592566118656183, + "grad_norm": 0.7659216523170471, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0486, + "step": 5570 + }, + { + "epoch": 0.15954253037884203, + "grad_norm": 0.6377918124198914, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0497, + "step": 5580 + }, + { + "epoch": 0.15982844889206577, + "grad_norm": 0.9122095704078674, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0497, + "step": 5590 + }, + { + "epoch": 0.1601143674052895, + "grad_norm": 0.5986319780349731, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0789, + "step": 5600 + }, + { + "epoch": 0.1604002859185132, + "grad_norm": 0.6486982107162476, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0481, + "step": 5610 + }, + { + "epoch": 0.16068620443173695, + "grad_norm": 0.9778286814689636, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0609, + "step": 5620 + }, + { + "epoch": 0.1609721229449607, + "grad_norm": 0.9133608341217041, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0499, + "step": 5630 + }, + { + "epoch": 0.16125804145818443, + "grad_norm": 0.8979085087776184, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0539, + "step": 5640 + }, + { + "epoch": 0.16154395997140814, + "grad_norm": 0.7787102460861206, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0593, + "step": 5650 + }, + { + "epoch": 0.16182987848463187, + "grad_norm": 0.8269296884536743, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0466, + "step": 5660 + }, + { + "epoch": 0.1621157969978556, + "grad_norm": 1.0018537044525146, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0542, + "step": 5670 + }, + { + "epoch": 0.16240171551107935, + "grad_norm": 0.6690066456794739, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0504, + "step": 5680 + }, + { + "epoch": 0.16268763402430308, + "grad_norm": 0.8186119198799133, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0523, + "step": 5690 + }, + { + "epoch": 0.1629735525375268, + "grad_norm": 0.6039218902587891, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.053, + "step": 5700 + }, + { + "epoch": 0.16325947105075053, + "grad_norm": 0.5570294857025146, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0627, + "step": 5710 + }, + { + "epoch": 0.16354538956397427, + "grad_norm": 0.6330029368400574, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.043, + "step": 5720 + }, + { + "epoch": 0.163831308077198, + "grad_norm": 0.42857953906059265, + "learning_rate": 1.884459101447439e-05, + "loss": 0.043, + "step": 5730 + }, + { + "epoch": 0.16411722659042172, + "grad_norm": 0.6611765027046204, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0478, + "step": 5740 + }, + { + "epoch": 0.16440314510364545, + "grad_norm": 0.5025321841239929, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0563, + "step": 5750 + }, + { + "epoch": 0.1646890636168692, + "grad_norm": 0.468772292137146, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0579, + "step": 5760 + }, + { + "epoch": 0.16497498213009293, + "grad_norm": 0.8914149403572083, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0451, + "step": 5770 + }, + { + "epoch": 0.16526090064331667, + "grad_norm": 0.7421362996101379, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0446, + "step": 5780 + }, + { + "epoch": 0.16554681915654038, + "grad_norm": 0.6159907579421997, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0486, + "step": 5790 + }, + { + "epoch": 0.1658327376697641, + "grad_norm": 0.7762402892112732, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0528, + "step": 5800 + }, + { + "epoch": 0.16611865618298785, + "grad_norm": 0.688562273979187, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0562, + "step": 5810 + }, + { + "epoch": 0.1664045746962116, + "grad_norm": 0.6233720183372498, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0454, + "step": 5820 + }, + { + "epoch": 0.1666904932094353, + "grad_norm": 1.0762931108474731, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0586, + "step": 5830 + }, + { + "epoch": 0.16697641172265903, + "grad_norm": 0.6782101988792419, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0486, + "step": 5840 + }, + { + "epoch": 0.16726233023588277, + "grad_norm": 0.8854986429214478, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0668, + "step": 5850 + }, + { + "epoch": 0.1675482487491065, + "grad_norm": 0.6537308096885681, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0456, + "step": 5860 + }, + { + "epoch": 0.16783416726233025, + "grad_norm": 1.4588080644607544, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0531, + "step": 5870 + }, + { + "epoch": 0.16812008577555396, + "grad_norm": 0.4888838529586792, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0608, + "step": 5880 + }, + { + "epoch": 0.1684060042887777, + "grad_norm": 0.6046859622001648, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0596, + "step": 5890 + }, + { + "epoch": 0.16869192280200143, + "grad_norm": 1.0373053550720215, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0672, + "step": 5900 + }, + { + "epoch": 0.16897784131522517, + "grad_norm": 0.7728743553161621, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0567, + "step": 5910 + }, + { + "epoch": 0.16926375982844888, + "grad_norm": 0.7804396152496338, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0443, + "step": 5920 + }, + { + "epoch": 0.16954967834167262, + "grad_norm": 0.5331568717956543, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0462, + "step": 5930 + }, + { + "epoch": 0.16983559685489635, + "grad_norm": 0.5623118877410889, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0549, + "step": 5940 + }, + { + "epoch": 0.1701215153681201, + "grad_norm": 0.5113009214401245, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.17040743388134383, + "grad_norm": 0.45996031165122986, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0474, + "step": 5960 + }, + { + "epoch": 0.17069335239456754, + "grad_norm": 0.9673702716827393, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0496, + "step": 5970 + }, + { + "epoch": 0.17097927090779128, + "grad_norm": 0.6134442687034607, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0472, + "step": 5980 + }, + { + "epoch": 0.171265189421015, + "grad_norm": 0.5929660797119141, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0448, + "step": 5990 + }, + { + "epoch": 0.17155110793423875, + "grad_norm": 0.6973591446876526, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0561, + "step": 6000 + }, + { + "epoch": 0.17183702644746246, + "grad_norm": 0.6361686587333679, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0528, + "step": 6010 + }, + { + "epoch": 0.1721229449606862, + "grad_norm": 0.8463344573974609, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0505, + "step": 6020 + }, + { + "epoch": 0.17240886347390993, + "grad_norm": 0.7931243777275085, + "learning_rate": 1.869709961183946e-05, + "loss": 0.047, + "step": 6030 + }, + { + "epoch": 0.17269478198713367, + "grad_norm": 0.8827673196792603, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0449, + "step": 6040 + }, + { + "epoch": 0.1729807005003574, + "grad_norm": 0.624167263507843, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0432, + "step": 6050 + }, + { + "epoch": 0.17326661901358112, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0546, + "step": 6060 + }, + { + "epoch": 0.17355253752680486, + "grad_norm": 0.6836652755737305, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0463, + "step": 6070 + }, + { + "epoch": 0.1738384560400286, + "grad_norm": 0.5454772114753723, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0554, + "step": 6080 + }, + { + "epoch": 0.17412437455325233, + "grad_norm": 0.3758164048194885, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0437, + "step": 6090 + }, + { + "epoch": 0.17441029306647604, + "grad_norm": 0.4269026517868042, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0436, + "step": 6100 + }, + { + "epoch": 0.17469621157969978, + "grad_norm": 1.3504232168197632, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0563, + "step": 6110 + }, + { + "epoch": 0.17498213009292352, + "grad_norm": 0.6270191669464111, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0552, + "step": 6120 + }, + { + "epoch": 0.17526804860614725, + "grad_norm": 0.7632624506950378, + "learning_rate": 1.864612143364565e-05, + "loss": 0.042, + "step": 6130 + }, + { + "epoch": 0.175553967119371, + "grad_norm": 0.7420883774757385, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0472, + "step": 6140 + }, + { + "epoch": 0.1758398856325947, + "grad_norm": 0.38518550992012024, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0494, + "step": 6150 + }, + { + "epoch": 0.17612580414581844, + "grad_norm": 0.4203122556209564, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.049, + "step": 6160 + }, + { + "epoch": 0.17641172265904217, + "grad_norm": 0.843169093132019, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0528, + "step": 6170 + }, + { + "epoch": 0.1766976411722659, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0441, + "step": 6180 + }, + { + "epoch": 0.17698355968548962, + "grad_norm": 0.9894040822982788, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0494, + "step": 6190 + }, + { + "epoch": 0.17726947819871336, + "grad_norm": 0.8269744515419006, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0533, + "step": 6200 + }, + { + "epoch": 0.1775553967119371, + "grad_norm": 0.7923200726509094, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0518, + "step": 6210 + }, + { + "epoch": 0.17784131522516083, + "grad_norm": 0.580436646938324, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 0.17812723373838457, + "grad_norm": 1.0633399486541748, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0528, + "step": 6230 + }, + { + "epoch": 0.17841315225160828, + "grad_norm": 0.925599217414856, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0552, + "step": 6240 + }, + { + "epoch": 0.17869907076483202, + "grad_norm": 0.5874597430229187, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0473, + "step": 6250 + }, + { + "epoch": 0.17898498927805576, + "grad_norm": 0.9065818190574646, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0504, + "step": 6260 + }, + { + "epoch": 0.1792709077912795, + "grad_norm": 0.9060930609703064, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0578, + "step": 6270 + }, + { + "epoch": 0.1795568263045032, + "grad_norm": 0.6221855878829956, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.17984274481772694, + "grad_norm": 0.589621901512146, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0452, + "step": 6290 + }, + { + "epoch": 0.18012866333095068, + "grad_norm": 0.4308580756187439, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0528, + "step": 6300 + }, + { + "epoch": 0.18041458184417442, + "grad_norm": 0.34031248092651367, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0544, + "step": 6310 + }, + { + "epoch": 0.18070050035739815, + "grad_norm": 0.6438931226730347, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0411, + "step": 6320 + }, + { + "epoch": 0.18098641887062186, + "grad_norm": 0.5436957478523254, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0381, + "step": 6330 + }, + { + "epoch": 0.1812723373838456, + "grad_norm": 0.7326043248176575, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0486, + "step": 6340 + }, + { + "epoch": 0.18155825589706934, + "grad_norm": 0.9194608330726624, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0455, + "step": 6350 + }, + { + "epoch": 0.18184417441029307, + "grad_norm": 0.9366886019706726, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0529, + "step": 6360 + }, + { + "epoch": 0.18213009292351678, + "grad_norm": 0.3178311586380005, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0455, + "step": 6370 + }, + { + "epoch": 0.18241601143674052, + "grad_norm": 0.9811000823974609, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.054, + "step": 6380 + }, + { + "epoch": 0.18270192994996426, + "grad_norm": 0.4635869562625885, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0466, + "step": 6390 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.6958444118499756, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0448, + "step": 6400 + }, + { + "epoch": 0.18327376697641173, + "grad_norm": 0.765814483165741, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0558, + "step": 6410 + }, + { + "epoch": 0.18355968548963544, + "grad_norm": 0.4117525815963745, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0484, + "step": 6420 + }, + { + "epoch": 0.18384560400285918, + "grad_norm": 0.6114997267723083, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0449, + "step": 6430 + }, + { + "epoch": 0.18413152251608292, + "grad_norm": 0.6006572842597961, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0442, + "step": 6440 + }, + { + "epoch": 0.18441744102930666, + "grad_norm": 0.5918669104576111, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0472, + "step": 6450 + }, + { + "epoch": 0.18470335954253037, + "grad_norm": 0.42107391357421875, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0471, + "step": 6460 + }, + { + "epoch": 0.1849892780557541, + "grad_norm": 0.5666350722312927, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0451, + "step": 6470 + }, + { + "epoch": 0.18527519656897784, + "grad_norm": 0.6074198484420776, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.051, + "step": 6480 + }, + { + "epoch": 0.18556111508220158, + "grad_norm": 0.771105945110321, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0402, + "step": 6490 + }, + { + "epoch": 0.18584703359542531, + "grad_norm": 0.6381934881210327, + "learning_rate": 1.844974808419918e-05, + "loss": 0.049, + "step": 6500 + }, + { + "epoch": 0.18613295210864902, + "grad_norm": 0.4039069712162018, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0477, + "step": 6510 + }, + { + "epoch": 0.18641887062187276, + "grad_norm": 0.8936404585838318, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0515, + "step": 6520 + }, + { + "epoch": 0.1867047891350965, + "grad_norm": 0.5358276963233948, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0397, + "step": 6530 + }, + { + "epoch": 0.18699070764832024, + "grad_norm": 0.7260947823524475, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0501, + "step": 6540 + }, + { + "epoch": 0.18727662616154395, + "grad_norm": 0.6378960609436035, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0575, + "step": 6550 + }, + { + "epoch": 0.18756254467476768, + "grad_norm": 0.5879429578781128, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.041, + "step": 6560 + }, + { + "epoch": 0.18784846318799142, + "grad_norm": 0.846297025680542, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0494, + "step": 6570 + }, + { + "epoch": 0.18813438170121516, + "grad_norm": 0.5211764574050903, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0463, + "step": 6580 + }, + { + "epoch": 0.1884203002144389, + "grad_norm": 0.8060504794120789, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0486, + "step": 6590 + }, + { + "epoch": 0.1887062187276626, + "grad_norm": 0.5741685628890991, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0435, + "step": 6600 + }, + { + "epoch": 0.18899213724088634, + "grad_norm": 0.6195408701896667, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0609, + "step": 6610 + }, + { + "epoch": 0.18927805575411008, + "grad_norm": 0.46843090653419495, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0477, + "step": 6620 + }, + { + "epoch": 0.18956397426733382, + "grad_norm": 0.5169982314109802, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0515, + "step": 6630 + }, + { + "epoch": 0.18984989278055753, + "grad_norm": 0.5571608543395996, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0492, + "step": 6640 + }, + { + "epoch": 0.19013581129378126, + "grad_norm": 0.7798209190368652, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0682, + "step": 6650 + }, + { + "epoch": 0.190421729807005, + "grad_norm": 0.6120383143424988, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0516, + "step": 6660 + }, + { + "epoch": 0.19070764832022874, + "grad_norm": 1.0191924571990967, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.049, + "step": 6670 + }, + { + "epoch": 0.19099356683345248, + "grad_norm": 0.5271646976470947, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0461, + "step": 6680 + }, + { + "epoch": 0.1912794853466762, + "grad_norm": 0.3315111994743347, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 0.19156540385989992, + "grad_norm": 0.7598944306373596, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0576, + "step": 6700 + }, + { + "epoch": 0.19185132237312366, + "grad_norm": 0.8039186596870422, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0489, + "step": 6710 + }, + { + "epoch": 0.1921372408863474, + "grad_norm": 0.911704957485199, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0508, + "step": 6720 + }, + { + "epoch": 0.1924231593995711, + "grad_norm": 0.6092261672019958, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0494, + "step": 6730 + }, + { + "epoch": 0.19270907791279485, + "grad_norm": 0.7890674471855164, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.049, + "step": 6740 + }, + { + "epoch": 0.19299499642601858, + "grad_norm": 0.8601320385932922, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0402, + "step": 6750 + }, + { + "epoch": 0.19328091493924232, + "grad_norm": 0.8750951290130615, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0517, + "step": 6760 + }, + { + "epoch": 0.19356683345246606, + "grad_norm": 0.7143217921257019, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0414, + "step": 6770 + }, + { + "epoch": 0.19385275196568977, + "grad_norm": 0.8340809345245361, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.1941386704789135, + "grad_norm": 0.4074079692363739, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0455, + "step": 6790 + }, + { + "epoch": 0.19442458899213724, + "grad_norm": 0.5369135737419128, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0472, + "step": 6800 + }, + { + "epoch": 0.19471050750536098, + "grad_norm": 0.44467195868492126, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 0.1949964260185847, + "grad_norm": 0.6032440662384033, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0479, + "step": 6820 + }, + { + "epoch": 0.19528234453180843, + "grad_norm": 0.4078349173069, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 0.19556826304503216, + "grad_norm": 0.49480268359184265, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0432, + "step": 6840 + }, + { + "epoch": 0.1958541815582559, + "grad_norm": 0.9844514727592468, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0479, + "step": 6850 + }, + { + "epoch": 0.19614010007147964, + "grad_norm": 1.1353951692581177, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0539, + "step": 6860 + }, + { + "epoch": 0.19642601858470335, + "grad_norm": 0.7535272836685181, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0572, + "step": 6870 + }, + { + "epoch": 0.1967119370979271, + "grad_norm": 0.4950162470340729, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0466, + "step": 6880 + }, + { + "epoch": 0.19699785561115082, + "grad_norm": 0.5310598015785217, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0487, + "step": 6890 + }, + { + "epoch": 0.19728377412437456, + "grad_norm": 0.9481188654899597, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0448, + "step": 6900 + }, + { + "epoch": 0.19756969263759827, + "grad_norm": 0.5303207039833069, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0419, + "step": 6910 + }, + { + "epoch": 0.197855611150822, + "grad_norm": 0.6180852055549622, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 0.19814152966404575, + "grad_norm": 0.5310384631156921, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0471, + "step": 6930 + }, + { + "epoch": 0.19842744817726948, + "grad_norm": 0.546660304069519, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0481, + "step": 6940 + }, + { + "epoch": 0.19871336669049322, + "grad_norm": 0.7824214696884155, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0423, + "step": 6950 + }, + { + "epoch": 0.19899928520371693, + "grad_norm": 0.9130761623382568, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0436, + "step": 6960 + }, + { + "epoch": 0.19928520371694067, + "grad_norm": 1.0512481927871704, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 0.1995711222301644, + "grad_norm": 0.8660218715667725, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0533, + "step": 6980 + }, + { + "epoch": 0.19985704074338814, + "grad_norm": 0.5280078649520874, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0455, + "step": 6990 + }, + { + "epoch": 0.20014295925661185, + "grad_norm": 0.6151753067970276, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0476, + "step": 7000 + }, + { + "epoch": 0.2004288777698356, + "grad_norm": 0.7165628671646118, + "learning_rate": 1.815952390818299e-05, + "loss": 0.051, + "step": 7010 + }, + { + "epoch": 0.20071479628305933, + "grad_norm": 0.6857513189315796, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0566, + "step": 7020 + }, + { + "epoch": 0.20100071479628306, + "grad_norm": 0.5589154958724976, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0741, + "step": 7030 + }, + { + "epoch": 0.2012866333095068, + "grad_norm": 0.6684713959693909, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0461, + "step": 7040 + }, + { + "epoch": 0.2015725518227305, + "grad_norm": 0.41142046451568604, + "learning_rate": 1.813582526827608e-05, + "loss": 0.043, + "step": 7050 + }, + { + "epoch": 0.20185847033595425, + "grad_norm": 0.29734253883361816, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0464, + "step": 7060 + }, + { + "epoch": 0.20214438884917799, + "grad_norm": 0.3914707899093628, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.052, + "step": 7070 + }, + { + "epoch": 0.20243030736240172, + "grad_norm": 0.5075880885124207, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0401, + "step": 7080 + }, + { + "epoch": 0.20271622587562543, + "grad_norm": 0.6182138919830322, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0428, + "step": 7090 + }, + { + "epoch": 0.20300214438884917, + "grad_norm": 1.0438663959503174, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0476, + "step": 7100 + }, + { + "epoch": 0.2032880629020729, + "grad_norm": 0.4646940529346466, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0419, + "step": 7110 + }, + { + "epoch": 0.20357398141529665, + "grad_norm": 0.4236893951892853, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0539, + "step": 7120 + }, + { + "epoch": 0.20385989992852038, + "grad_norm": 0.7975651025772095, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0459, + "step": 7130 + }, + { + "epoch": 0.2041458184417441, + "grad_norm": 0.9628227949142456, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0568, + "step": 7140 + }, + { + "epoch": 0.20443173695496783, + "grad_norm": 0.8878718614578247, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0528, + "step": 7150 + }, + { + "epoch": 0.20471765546819157, + "grad_norm": 0.5407359004020691, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0417, + "step": 7160 + }, + { + "epoch": 0.2050035739814153, + "grad_norm": 0.4407803416252136, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0435, + "step": 7170 + }, + { + "epoch": 0.20528949249463901, + "grad_norm": 0.4055456221103668, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0652, + "step": 7180 + }, + { + "epoch": 0.20557541100786275, + "grad_norm": 0.44706887006759644, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0476, + "step": 7190 + }, + { + "epoch": 0.2058613295210865, + "grad_norm": 1.2640881538391113, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0496, + "step": 7200 + }, + { + "epoch": 0.20614724803431023, + "grad_norm": 0.3773214817047119, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0455, + "step": 7210 + }, + { + "epoch": 0.20643316654753396, + "grad_norm": 0.6460191011428833, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0765, + "step": 7220 + }, + { + "epoch": 0.20671908506075767, + "grad_norm": 0.6048172116279602, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0506, + "step": 7230 + }, + { + "epoch": 0.2070050035739814, + "grad_norm": 0.38502392172813416, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0487, + "step": 7240 + }, + { + "epoch": 0.20729092208720515, + "grad_norm": 1.5727262496948242, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.20757684060042889, + "grad_norm": 0.3985368609428406, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0501, + "step": 7260 + }, + { + "epoch": 0.2078627591136526, + "grad_norm": 0.4519219994544983, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0542, + "step": 7270 + }, + { + "epoch": 0.20814867762687633, + "grad_norm": 0.6547327637672424, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0628, + "step": 7280 + }, + { + "epoch": 0.20843459614010007, + "grad_norm": 0.7864896655082703, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0521, + "step": 7290 + }, + { + "epoch": 0.2087205146533238, + "grad_norm": 0.6605416536331177, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0501, + "step": 7300 + }, + { + "epoch": 0.20900643316654754, + "grad_norm": 0.8260928988456726, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 0.20929235167977125, + "grad_norm": 0.7167025804519653, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0465, + "step": 7320 + }, + { + "epoch": 0.209578270192995, + "grad_norm": 0.6838316917419434, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0449, + "step": 7330 + }, + { + "epoch": 0.20986418870621873, + "grad_norm": 0.46520882844924927, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0441, + "step": 7340 + }, + { + "epoch": 0.21015010721944247, + "grad_norm": 0.680860698223114, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0498, + "step": 7350 + }, + { + "epoch": 0.21043602573266618, + "grad_norm": 0.6697542071342468, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0361, + "step": 7360 + }, + { + "epoch": 0.21072194424588991, + "grad_norm": 0.9322425127029419, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0561, + "step": 7370 + }, + { + "epoch": 0.21100786275911365, + "grad_norm": 0.7454982399940491, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0464, + "step": 7380 + }, + { + "epoch": 0.2112937812723374, + "grad_norm": 0.5052962899208069, + "learning_rate": 1.792902262617481e-05, + "loss": 0.042, + "step": 7390 + }, + { + "epoch": 0.21157969978556113, + "grad_norm": 0.622719407081604, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0392, + "step": 7400 + }, + { + "epoch": 0.21186561829878484, + "grad_norm": 0.8296751976013184, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0512, + "step": 7410 + }, + { + "epoch": 0.21215153681200857, + "grad_norm": 0.7341750860214233, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0407, + "step": 7420 + }, + { + "epoch": 0.2124374553252323, + "grad_norm": 0.8206498026847839, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0491, + "step": 7430 + }, + { + "epoch": 0.21272337383845605, + "grad_norm": 0.5625871419906616, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0405, + "step": 7440 + }, + { + "epoch": 0.21300929235167976, + "grad_norm": 0.600284218788147, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0464, + "step": 7450 + }, + { + "epoch": 0.2132952108649035, + "grad_norm": 1.0839911699295044, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0481, + "step": 7460 + }, + { + "epoch": 0.21358112937812723, + "grad_norm": 0.45663371682167053, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0478, + "step": 7470 + }, + { + "epoch": 0.21386704789135097, + "grad_norm": 0.9196961522102356, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0566, + "step": 7480 + }, + { + "epoch": 0.2141529664045747, + "grad_norm": 0.5013288855552673, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0442, + "step": 7490 + }, + { + "epoch": 0.21443888491779842, + "grad_norm": 0.6444706916809082, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0484, + "step": 7500 + }, + { + "epoch": 0.21472480343102215, + "grad_norm": 0.5789361000061035, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0536, + "step": 7510 + }, + { + "epoch": 0.2150107219442459, + "grad_norm": 0.7474827170372009, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0526, + "step": 7520 + }, + { + "epoch": 0.21529664045746963, + "grad_norm": 0.7054215669631958, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0538, + "step": 7530 + }, + { + "epoch": 0.21558255897069334, + "grad_norm": 0.9778858423233032, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0533, + "step": 7540 + }, + { + "epoch": 0.21586847748391708, + "grad_norm": 0.7189548015594482, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0479, + "step": 7550 + }, + { + "epoch": 0.2161543959971408, + "grad_norm": 0.8761522769927979, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0512, + "step": 7560 + }, + { + "epoch": 0.21644031451036455, + "grad_norm": 0.6686418652534485, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.06, + "step": 7570 + }, + { + "epoch": 0.2167262330235883, + "grad_norm": 0.6385156512260437, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0495, + "step": 7580 + }, + { + "epoch": 0.217012151536812, + "grad_norm": 0.4785522520542145, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0477, + "step": 7590 + }, + { + "epoch": 0.21729807005003574, + "grad_norm": 0.883179783821106, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0472, + "step": 7600 + }, + { + "epoch": 0.21758398856325947, + "grad_norm": 0.5431568026542664, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0383, + "step": 7610 + }, + { + "epoch": 0.2178699070764832, + "grad_norm": 0.7085764408111572, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0465, + "step": 7620 + }, + { + "epoch": 0.21815582558970692, + "grad_norm": 0.4877212643623352, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0517, + "step": 7630 + }, + { + "epoch": 0.21844174410293066, + "grad_norm": 0.6874392032623291, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0555, + "step": 7640 + }, + { + "epoch": 0.2187276626161544, + "grad_norm": 0.9611791372299194, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0415, + "step": 7650 + }, + { + "epoch": 0.21901358112937813, + "grad_norm": 0.3618314862251282, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0601, + "step": 7660 + }, + { + "epoch": 0.21929949964260187, + "grad_norm": 0.5366251468658447, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0383, + "step": 7670 + }, + { + "epoch": 0.21958541815582558, + "grad_norm": 0.6323129534721375, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0536, + "step": 7680 + }, + { + "epoch": 0.21987133666904932, + "grad_norm": 0.4621681571006775, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0442, + "step": 7690 + }, + { + "epoch": 0.22015725518227305, + "grad_norm": 0.9297679662704468, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0517, + "step": 7700 + }, + { + "epoch": 0.2204431736954968, + "grad_norm": 0.5950489640235901, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0468, + "step": 7710 + }, + { + "epoch": 0.2207290922087205, + "grad_norm": 0.30251142382621765, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0471, + "step": 7720 + }, + { + "epoch": 0.22101501072194424, + "grad_norm": 0.6247804760932922, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0488, + "step": 7730 + }, + { + "epoch": 0.22130092923516798, + "grad_norm": 0.7118366360664368, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0567, + "step": 7740 + }, + { + "epoch": 0.2215868477483917, + "grad_norm": 0.6265056133270264, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.06, + "step": 7750 + }, + { + "epoch": 0.22187276626161545, + "grad_norm": 0.7232056260108948, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0393, + "step": 7760 + }, + { + "epoch": 0.22215868477483916, + "grad_norm": 0.7981307506561279, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0518, + "step": 7770 + }, + { + "epoch": 0.2224446032880629, + "grad_norm": 0.4492819011211395, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0425, + "step": 7780 + }, + { + "epoch": 0.22273052180128664, + "grad_norm": 0.578440248966217, + "learning_rate": 1.767371389304538e-05, + "loss": 0.043, + "step": 7790 + }, + { + "epoch": 0.22301644031451037, + "grad_norm": 0.8093826174736023, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0571, + "step": 7800 + }, + { + "epoch": 0.22330235882773408, + "grad_norm": 0.864661455154419, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0429, + "step": 7810 + }, + { + "epoch": 0.22358827734095782, + "grad_norm": 0.50054532289505, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0404, + "step": 7820 + }, + { + "epoch": 0.22387419585418156, + "grad_norm": 0.5690511465072632, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0406, + "step": 7830 + }, + { + "epoch": 0.2241601143674053, + "grad_norm": 0.7075231671333313, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0478, + "step": 7840 + }, + { + "epoch": 0.22444603288062903, + "grad_norm": 0.6326742768287659, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.065, + "step": 7850 + }, + { + "epoch": 0.22473195139385274, + "grad_norm": 0.48305049538612366, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0456, + "step": 7860 + }, + { + "epoch": 0.22501786990707648, + "grad_norm": 0.6333707571029663, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.048, + "step": 7870 + }, + { + "epoch": 0.22530378842030022, + "grad_norm": 0.6568662524223328, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0422, + "step": 7880 + }, + { + "epoch": 0.22558970693352395, + "grad_norm": 0.6302695870399475, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0589, + "step": 7890 + }, + { + "epoch": 0.22587562544674766, + "grad_norm": 0.6373940110206604, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0504, + "step": 7900 + }, + { + "epoch": 0.2261615439599714, + "grad_norm": 0.7108445167541504, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0486, + "step": 7910 + }, + { + "epoch": 0.22644746247319514, + "grad_norm": 0.5274208784103394, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0693, + "step": 7920 + }, + { + "epoch": 0.22673338098641888, + "grad_norm": 0.4020678997039795, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0461, + "step": 7930 + }, + { + "epoch": 0.2270192994996426, + "grad_norm": 0.5584745407104492, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0376, + "step": 7940 + }, + { + "epoch": 0.22730521801286632, + "grad_norm": 0.6614044904708862, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0461, + "step": 7950 + }, + { + "epoch": 0.22759113652609006, + "grad_norm": 0.506636917591095, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0431, + "step": 7960 + }, + { + "epoch": 0.2278770550393138, + "grad_norm": 0.5168156027793884, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0404, + "step": 7970 + }, + { + "epoch": 0.22816297355253753, + "grad_norm": 0.552480161190033, + "learning_rate": 1.754802282200567e-05, + "loss": 0.0565, + "step": 7980 + }, + { + "epoch": 0.22844889206576124, + "grad_norm": 0.8191191554069519, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0556, + "step": 7990 + }, + { + "epoch": 0.22873481057898498, + "grad_norm": 0.7767695188522339, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0447, + "step": 8000 + }, + { + "epoch": 0.22902072909220872, + "grad_norm": 0.9050281047821045, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0611, + "step": 8010 + }, + { + "epoch": 0.22930664760543246, + "grad_norm": 0.7805314660072327, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0532, + "step": 8020 + }, + { + "epoch": 0.2295925661186562, + "grad_norm": 0.6055987477302551, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0436, + "step": 8030 + }, + { + "epoch": 0.2298784846318799, + "grad_norm": 1.1075741052627563, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.053, + "step": 8040 + }, + { + "epoch": 0.23016440314510364, + "grad_norm": 0.6283855438232422, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0494, + "step": 8050 + }, + { + "epoch": 0.23045032165832738, + "grad_norm": 0.44009697437286377, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.047, + "step": 8060 + }, + { + "epoch": 0.23073624017155112, + "grad_norm": 0.4920162856578827, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0437, + "step": 8070 + }, + { + "epoch": 0.23102215868477483, + "grad_norm": 0.9286724328994751, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0513, + "step": 8080 + }, + { + "epoch": 0.23130807719799856, + "grad_norm": 0.6595107913017273, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0465, + "step": 8090 + }, + { + "epoch": 0.2315939957112223, + "grad_norm": 0.4930933713912964, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0422, + "step": 8100 + }, + { + "epoch": 0.23187991422444604, + "grad_norm": 0.6741859316825867, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0419, + "step": 8110 + }, + { + "epoch": 0.23216583273766978, + "grad_norm": 0.8081800937652588, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0449, + "step": 8120 + }, + { + "epoch": 0.23245175125089348, + "grad_norm": 1.0258036851882935, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0613, + "step": 8130 + }, + { + "epoch": 0.23273766976411722, + "grad_norm": 0.5007345080375671, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0473, + "step": 8140 + }, + { + "epoch": 0.23302358827734096, + "grad_norm": 0.3931804895401001, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0495, + "step": 8150 + }, + { + "epoch": 0.2333095067905647, + "grad_norm": 0.5907166600227356, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0449, + "step": 8160 + }, + { + "epoch": 0.2335954253037884, + "grad_norm": 0.49229851365089417, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0524, + "step": 8170 + }, + { + "epoch": 0.23388134381701214, + "grad_norm": 0.8386240601539612, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0527, + "step": 8180 + }, + { + "epoch": 0.23416726233023588, + "grad_norm": 0.7806615829467773, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0529, + "step": 8190 + }, + { + "epoch": 0.23445318084345962, + "grad_norm": 0.5716270804405212, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0534, + "step": 8200 + }, + { + "epoch": 0.23473909935668336, + "grad_norm": 1.165761947631836, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0591, + "step": 8210 + }, + { + "epoch": 0.23502501786990707, + "grad_norm": 0.867967426776886, + "learning_rate": 1.738529690353544e-05, + "loss": 0.049, + "step": 8220 + }, + { + "epoch": 0.2353109363831308, + "grad_norm": 0.5809492468833923, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0434, + "step": 8230 + }, + { + "epoch": 0.23559685489635454, + "grad_norm": 0.8418740034103394, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0461, + "step": 8240 + }, + { + "epoch": 0.23588277340957828, + "grad_norm": 0.5811617374420166, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0443, + "step": 8250 + }, + { + "epoch": 0.236168691922802, + "grad_norm": 0.7699318528175354, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0549, + "step": 8260 + }, + { + "epoch": 0.23645461043602573, + "grad_norm": 0.6066992878913879, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0415, + "step": 8270 + }, + { + "epoch": 0.23674052894924946, + "grad_norm": 0.7775973677635193, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0619, + "step": 8280 + }, + { + "epoch": 0.2370264474624732, + "grad_norm": 0.8320962190628052, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.048, + "step": 8290 + }, + { + "epoch": 0.23731236597569694, + "grad_norm": 0.7203818559646606, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0594, + "step": 8300 + }, + { + "epoch": 0.23759828448892065, + "grad_norm": 0.7634598612785339, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0614, + "step": 8310 + }, + { + "epoch": 0.23788420300214438, + "grad_norm": 0.557575523853302, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 0.23817012151536812, + "grad_norm": 1.0139968395233154, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0457, + "step": 8330 + }, + { + "epoch": 0.23845604002859186, + "grad_norm": 0.5543113946914673, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.048, + "step": 8340 + }, + { + "epoch": 0.23874195854181557, + "grad_norm": 1.0122590065002441, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0509, + "step": 8350 + }, + { + "epoch": 0.2390278770550393, + "grad_norm": 0.8776134252548218, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0475, + "step": 8360 + }, + { + "epoch": 0.23931379556826304, + "grad_norm": 0.41230106353759766, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0467, + "step": 8370 + }, + { + "epoch": 0.23959971408148678, + "grad_norm": 0.5460986495018005, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0455, + "step": 8380 + }, + { + "epoch": 0.23988563259471052, + "grad_norm": 0.5896333456039429, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.051, + "step": 8390 + }, + { + "epoch": 0.24017155110793423, + "grad_norm": 0.536375105381012, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0432, + "step": 8400 + }, + { + "epoch": 0.24045746962115797, + "grad_norm": 0.7597050666809082, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0459, + "step": 8410 + }, + { + "epoch": 0.2407433881343817, + "grad_norm": 0.6669795513153076, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0584, + "step": 8420 + }, + { + "epoch": 0.24102930664760544, + "grad_norm": 0.3614502251148224, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.039, + "step": 8430 + }, + { + "epoch": 0.24131522516082915, + "grad_norm": 0.5618023872375488, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0394, + "step": 8440 + }, + { + "epoch": 0.2416011436740529, + "grad_norm": 0.5897185802459717, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0502, + "step": 8450 + }, + { + "epoch": 0.24188706218727662, + "grad_norm": 0.5622876882553101, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0382, + "step": 8460 + }, + { + "epoch": 0.24217298070050036, + "grad_norm": 0.5639696717262268, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0652, + "step": 8470 + }, + { + "epoch": 0.2424588992137241, + "grad_norm": 0.5686836242675781, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0609, + "step": 8480 + }, + { + "epoch": 0.2427448177269478, + "grad_norm": 0.7248222827911377, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0512, + "step": 8490 + }, + { + "epoch": 0.24303073624017155, + "grad_norm": 0.6157225370407104, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0449, + "step": 8500 + }, + { + "epoch": 0.24331665475339528, + "grad_norm": 1.1660966873168945, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0427, + "step": 8510 + }, + { + "epoch": 0.24360257326661902, + "grad_norm": 1.1242589950561523, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0514, + "step": 8520 + }, + { + "epoch": 0.24388849177984273, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0491, + "step": 8530 + }, + { + "epoch": 0.24417441029306647, + "grad_norm": 0.41474589705467224, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0427, + "step": 8540 + }, + { + "epoch": 0.2444603288062902, + "grad_norm": 0.42195969820022583, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0486, + "step": 8550 + }, + { + "epoch": 0.24474624731951394, + "grad_norm": 0.3914433717727661, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0411, + "step": 8560 + }, + { + "epoch": 0.24503216583273768, + "grad_norm": 0.7590876817703247, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0439, + "step": 8570 + }, + { + "epoch": 0.2453180843459614, + "grad_norm": 0.4362296164035797, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0466, + "step": 8580 + }, + { + "epoch": 0.24560400285918513, + "grad_norm": 0.467949241399765, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0502, + "step": 8590 + }, + { + "epoch": 0.24588992137240887, + "grad_norm": 0.4731729328632355, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0599, + "step": 8600 + }, + { + "epoch": 0.2461758398856326, + "grad_norm": 0.491644948720932, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0524, + "step": 8610 + }, + { + "epoch": 0.2464617583988563, + "grad_norm": 0.5254928469657898, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0469, + "step": 8620 + }, + { + "epoch": 0.24674767691208005, + "grad_norm": 0.5721238255500793, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0493, + "step": 8630 + }, + { + "epoch": 0.2470335954253038, + "grad_norm": 0.5806096792221069, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0391, + "step": 8640 + }, + { + "epoch": 0.24731951393852752, + "grad_norm": 0.6683222055435181, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.24760543245175126, + "grad_norm": 0.41728726029396057, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0411, + "step": 8660 + }, + { + "epoch": 0.24789135096497497, + "grad_norm": 0.6001113653182983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0413, + "step": 8670 + }, + { + "epoch": 0.2481772694781987, + "grad_norm": 0.43813610076904297, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0389, + "step": 8680 + }, + { + "epoch": 0.24846318799142245, + "grad_norm": 1.5533791780471802, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0597, + "step": 8690 + }, + { + "epoch": 0.24874910650464618, + "grad_norm": 1.175837755203247, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 0.2490350250178699, + "grad_norm": 0.4798300862312317, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0459, + "step": 8710 + }, + { + "epoch": 0.24932094353109363, + "grad_norm": 0.7334772944450378, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0432, + "step": 8720 + }, + { + "epoch": 0.24960686204431737, + "grad_norm": 0.9633310437202454, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.05, + "step": 8730 + }, + { + "epoch": 0.2498927805575411, + "grad_norm": 0.7353480458259583, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.25017869907076484, + "grad_norm": 0.5958748459815979, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0428, + "step": 8750 + }, + { + "epoch": 0.2504646175839886, + "grad_norm": 0.8538689613342285, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0498, + "step": 8760 + }, + { + "epoch": 0.2507505360972123, + "grad_norm": 0.606607973575592, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.251036454610436, + "grad_norm": 0.3999035060405731, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0714, + "step": 8780 + }, + { + "epoch": 0.25132237312365974, + "grad_norm": 0.807314932346344, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.057, + "step": 8790 + }, + { + "epoch": 0.2516082916368835, + "grad_norm": 0.5238217115402222, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0485, + "step": 8800 + }, + { + "epoch": 0.2518942101501072, + "grad_norm": 1.6465950012207031, + "learning_rate": 1.696714953556411e-05, + "loss": 0.056, + "step": 8810 + }, + { + "epoch": 0.25218012866333095, + "grad_norm": 0.6568214297294617, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0424, + "step": 8820 + }, + { + "epoch": 0.2524660471765547, + "grad_norm": 0.4695168137550354, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0456, + "step": 8830 + }, + { + "epoch": 0.2527519656897784, + "grad_norm": 0.5652263164520264, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0527, + "step": 8840 + }, + { + "epoch": 0.25303788420300216, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0441, + "step": 8850 + }, + { + "epoch": 0.2533238027162259, + "grad_norm": 0.8288971781730652, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0513, + "step": 8860 + }, + { + "epoch": 0.2536097212294496, + "grad_norm": 0.8606051802635193, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0416, + "step": 8870 + }, + { + "epoch": 0.2538956397426733, + "grad_norm": 0.7235842347145081, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0481, + "step": 8880 + }, + { + "epoch": 0.25418155825589706, + "grad_norm": 0.9602673053741455, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0465, + "step": 8890 + }, + { + "epoch": 0.2544674767691208, + "grad_norm": 0.6431217789649963, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0489, + "step": 8900 + }, + { + "epoch": 0.25475339528234453, + "grad_norm": 0.42215701937675476, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0376, + "step": 8910 + }, + { + "epoch": 0.25503931379556827, + "grad_norm": 0.5899976491928101, + "learning_rate": 1.688644181174108e-05, + "loss": 0.048, + "step": 8920 + }, + { + "epoch": 0.255325232308792, + "grad_norm": 0.9504411816596985, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.047, + "step": 8930 + }, + { + "epoch": 0.25561115082201574, + "grad_norm": 0.5808438062667847, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0535, + "step": 8940 + }, + { + "epoch": 0.2558970693352395, + "grad_norm": 0.3811270594596863, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0418, + "step": 8950 + }, + { + "epoch": 0.25618298784846316, + "grad_norm": 1.0257363319396973, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0548, + "step": 8960 + }, + { + "epoch": 0.2564689063616869, + "grad_norm": 0.7294469475746155, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0569, + "step": 8970 + }, + { + "epoch": 0.25675482487491064, + "grad_norm": 0.4967000484466553, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0488, + "step": 8980 + }, + { + "epoch": 0.2570407433881344, + "grad_norm": 0.9160422086715698, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0471, + "step": 8990 + }, + { + "epoch": 0.2573266619013581, + "grad_norm": 0.5125435590744019, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.25761258041458185, + "grad_norm": 0.5617201328277588, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0597, + "step": 9010 + }, + { + "epoch": 0.2578984989278056, + "grad_norm": 0.7771851420402527, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0485, + "step": 9020 + }, + { + "epoch": 0.2581844174410293, + "grad_norm": 0.8434289693832397, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0429, + "step": 9030 + }, + { + "epoch": 0.25847033595425306, + "grad_norm": 0.513541042804718, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0488, + "step": 9040 + }, + { + "epoch": 0.25875625446747674, + "grad_norm": 1.0142096281051636, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0517, + "step": 9050 + }, + { + "epoch": 0.2590421729807005, + "grad_norm": 0.6343669295310974, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.049, + "step": 9060 + }, + { + "epoch": 0.2593280914939242, + "grad_norm": 0.33996936678886414, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.037, + "step": 9070 + }, + { + "epoch": 0.25961401000714796, + "grad_norm": 0.5964446663856506, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 0.2598999285203717, + "grad_norm": 0.4989728629589081, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0463, + "step": 9090 + }, + { + "epoch": 0.26018584703359543, + "grad_norm": 0.7735986113548279, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0576, + "step": 9100 + }, + { + "epoch": 0.26047176554681917, + "grad_norm": 1.2520418167114258, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0577, + "step": 9110 + }, + { + "epoch": 0.2607576840600429, + "grad_norm": 0.45247936248779297, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0458, + "step": 9120 + }, + { + "epoch": 0.26104360257326664, + "grad_norm": 0.8944823145866394, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0498, + "step": 9130 + }, + { + "epoch": 0.2613295210864903, + "grad_norm": 0.8308315277099609, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0545, + "step": 9140 + }, + { + "epoch": 0.26161543959971406, + "grad_norm": 0.6838778853416443, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 0.2619013581129378, + "grad_norm": 1.5998408794403076, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0591, + "step": 9160 + }, + { + "epoch": 0.26218727662616154, + "grad_norm": 0.8548596501350403, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.04, + "step": 9170 + }, + { + "epoch": 0.2624731951393853, + "grad_norm": 0.5784913897514343, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0464, + "step": 9180 + }, + { + "epoch": 0.262759113652609, + "grad_norm": 1.490502953529358, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0672, + "step": 9190 + }, + { + "epoch": 0.26304503216583275, + "grad_norm": 0.8950793743133545, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0532, + "step": 9200 + }, + { + "epoch": 0.2633309506790565, + "grad_norm": 0.5513611435890198, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 0.2636168691922802, + "grad_norm": 1.0512864589691162, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0487, + "step": 9220 + }, + { + "epoch": 0.2639027877055039, + "grad_norm": 0.48180028796195984, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0543, + "step": 9230 + }, + { + "epoch": 0.26418870621872764, + "grad_norm": 0.5451590418815613, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0553, + "step": 9240 + }, + { + "epoch": 0.2644746247319514, + "grad_norm": 0.6986148953437805, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0523, + "step": 9250 + }, + { + "epoch": 0.2647605432451751, + "grad_norm": 0.5977929830551147, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0446, + "step": 9260 + }, + { + "epoch": 0.26504646175839885, + "grad_norm": 0.6042361855506897, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0716, + "step": 9270 + }, + { + "epoch": 0.2653323802716226, + "grad_norm": 0.473418265581131, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0378, + "step": 9280 + }, + { + "epoch": 0.26561829878484633, + "grad_norm": 0.9332809448242188, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0484, + "step": 9290 + }, + { + "epoch": 0.26590421729807007, + "grad_norm": 0.5209246277809143, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0439, + "step": 9300 + }, + { + "epoch": 0.2661901358112938, + "grad_norm": 0.5742560625076294, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0468, + "step": 9310 + }, + { + "epoch": 0.2664760543245175, + "grad_norm": 0.585503876209259, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0507, + "step": 9320 + }, + { + "epoch": 0.2667619728377412, + "grad_norm": 0.5254957675933838, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0436, + "step": 9330 + }, + { + "epoch": 0.26704789135096496, + "grad_norm": 0.48314452171325684, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0417, + "step": 9340 + }, + { + "epoch": 0.2673338098641887, + "grad_norm": 0.630020022392273, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0425, + "step": 9350 + }, + { + "epoch": 0.26761972837741244, + "grad_norm": 0.3545299470424652, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0338, + "step": 9360 + }, + { + "epoch": 0.2679056468906362, + "grad_norm": 0.6934211850166321, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0445, + "step": 9370 + }, + { + "epoch": 0.2681915654038599, + "grad_norm": 0.6544952392578125, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0451, + "step": 9380 + }, + { + "epoch": 0.26847748391708365, + "grad_norm": 0.4581946134567261, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0422, + "step": 9390 + }, + { + "epoch": 0.2687634024303074, + "grad_norm": 0.6338506937026978, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0576, + "step": 9400 + }, + { + "epoch": 0.26904932094353107, + "grad_norm": 0.8165014386177063, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0474, + "step": 9410 + }, + { + "epoch": 0.2693352394567548, + "grad_norm": 0.793222188949585, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0546, + "step": 9420 + }, + { + "epoch": 0.26962115796997854, + "grad_norm": 0.3669852316379547, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0461, + "step": 9430 + }, + { + "epoch": 0.2699070764832023, + "grad_norm": 0.7339810729026794, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0433, + "step": 9440 + }, + { + "epoch": 0.270192994996426, + "grad_norm": 0.4948982298374176, + "learning_rate": 1.648606940465527e-05, + "loss": 0.048, + "step": 9450 + }, + { + "epoch": 0.27047891350964975, + "grad_norm": 0.4681016206741333, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0437, + "step": 9460 + }, + { + "epoch": 0.2707648320228735, + "grad_norm": 0.5091472864151001, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0576, + "step": 9470 + }, + { + "epoch": 0.27105075053609723, + "grad_norm": 0.5683515071868896, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0503, + "step": 9480 + }, + { + "epoch": 0.27133666904932097, + "grad_norm": 0.626844048500061, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0495, + "step": 9490 + }, + { + "epoch": 0.27162258756254465, + "grad_norm": 0.6757943034172058, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0495, + "step": 9500 + }, + { + "epoch": 0.2719085060757684, + "grad_norm": 0.7049196362495422, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0579, + "step": 9510 + }, + { + "epoch": 0.2721944245889921, + "grad_norm": 0.6469181776046753, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.051, + "step": 9520 + }, + { + "epoch": 0.27248034310221586, + "grad_norm": 0.5414942502975464, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0433, + "step": 9530 + }, + { + "epoch": 0.2727662616154396, + "grad_norm": 0.5642798542976379, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0495, + "step": 9540 + }, + { + "epoch": 0.27305218012866334, + "grad_norm": 1.0527595281600952, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0445, + "step": 9550 + }, + { + "epoch": 0.2733380986418871, + "grad_norm": 0.8501784801483154, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0627, + "step": 9560 + }, + { + "epoch": 0.2736240171551108, + "grad_norm": 0.7892033457756042, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 0.27390993566833455, + "grad_norm": 0.3588624596595764, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0512, + "step": 9580 + }, + { + "epoch": 0.27419585418155823, + "grad_norm": 0.7474772930145264, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.6217718124389648, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0435, + "step": 9600 + }, + { + "epoch": 0.2747676912080057, + "grad_norm": 0.7711623907089233, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.054, + "step": 9610 + }, + { + "epoch": 0.27505360972122944, + "grad_norm": 0.8171371221542358, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0371, + "step": 9620 + }, + { + "epoch": 0.2753395282344532, + "grad_norm": 0.8668338060379028, + "learning_rate": 1.634591312387623e-05, + "loss": 0.055, + "step": 9630 + }, + { + "epoch": 0.2756254467476769, + "grad_norm": 0.5683940052986145, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0478, + "step": 9640 + }, + { + "epoch": 0.27591136526090065, + "grad_norm": 0.44098007678985596, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0531, + "step": 9650 + }, + { + "epoch": 0.2761972837741244, + "grad_norm": 0.8305087685585022, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0462, + "step": 9660 + }, + { + "epoch": 0.27648320228734813, + "grad_norm": 0.9088799953460693, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0489, + "step": 9670 + }, + { + "epoch": 0.2767691208005718, + "grad_norm": 0.5590132474899292, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0486, + "step": 9680 + }, + { + "epoch": 0.27705503931379555, + "grad_norm": 0.776713490486145, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0443, + "step": 9690 + }, + { + "epoch": 0.2773409578270193, + "grad_norm": 0.6107578873634338, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0461, + "step": 9700 + }, + { + "epoch": 0.277626876340243, + "grad_norm": 0.4635901153087616, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0397, + "step": 9710 + }, + { + "epoch": 0.27791279485346676, + "grad_norm": 0.4220955967903137, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0463, + "step": 9720 + }, + { + "epoch": 0.2781987133666905, + "grad_norm": 0.4947739243507385, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0397, + "step": 9730 + }, + { + "epoch": 0.27848463187991424, + "grad_norm": 0.5589033961296082, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0426, + "step": 9740 + }, + { + "epoch": 0.278770550393138, + "grad_norm": 0.4904254972934723, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0458, + "step": 9750 + }, + { + "epoch": 0.2790564689063617, + "grad_norm": 0.34956127405166626, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.2793423874195854, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0424, + "step": 9770 + }, + { + "epoch": 0.27962830593280913, + "grad_norm": 0.48727869987487793, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0451, + "step": 9780 + }, + { + "epoch": 0.27991422444603287, + "grad_norm": 0.7314761281013489, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0523, + "step": 9790 + }, + { + "epoch": 0.2802001429592566, + "grad_norm": 0.5017405152320862, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0423, + "step": 9800 + }, + { + "epoch": 0.28048606147248034, + "grad_norm": 0.8375383615493774, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0435, + "step": 9810 + }, + { + "epoch": 0.2807719799857041, + "grad_norm": 0.8702818155288696, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0487, + "step": 9820 + }, + { + "epoch": 0.2810578984989278, + "grad_norm": 0.4649866223335266, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0483, + "step": 9830 + }, + { + "epoch": 0.28134381701215155, + "grad_norm": 0.7464607357978821, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0463, + "step": 9840 + }, + { + "epoch": 0.2816297355253753, + "grad_norm": 0.48055607080459595, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0418, + "step": 9850 + }, + { + "epoch": 0.281915654038599, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0433, + "step": 9860 + }, + { + "epoch": 0.2822015725518227, + "grad_norm": 0.8859265446662903, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0605, + "step": 9870 + }, + { + "epoch": 0.28248749106504645, + "grad_norm": 0.8236640691757202, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0441, + "step": 9880 + }, + { + "epoch": 0.2827734095782702, + "grad_norm": 0.6617199778556824, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0515, + "step": 9890 + }, + { + "epoch": 0.2830593280914939, + "grad_norm": 0.8017821907997131, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0503, + "step": 9900 + }, + { + "epoch": 0.28334524660471766, + "grad_norm": 1.070827603340149, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0485, + "step": 9910 + }, + { + "epoch": 0.2836311651179414, + "grad_norm": 1.021888256072998, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0479, + "step": 9920 + }, + { + "epoch": 0.28391708363116513, + "grad_norm": 0.34402501583099365, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0391, + "step": 9930 + }, + { + "epoch": 0.28420300214438887, + "grad_norm": 0.58541339635849, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0461, + "step": 9940 + }, + { + "epoch": 0.28448892065761255, + "grad_norm": 0.8062207102775574, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0553, + "step": 9950 + }, + { + "epoch": 0.2847748391708363, + "grad_norm": 0.6435661315917969, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0536, + "step": 9960 + }, + { + "epoch": 0.28506075768406003, + "grad_norm": 0.5670832395553589, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0405, + "step": 9970 + }, + { + "epoch": 0.28534667619728377, + "grad_norm": 0.45282548666000366, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0458, + "step": 9980 + }, + { + "epoch": 0.2856325947105075, + "grad_norm": 0.42272916436195374, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0392, + "step": 9990 + }, + { + "epoch": 0.28591851322373124, + "grad_norm": 0.5791928768157959, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0453, + "step": 10000 + }, + { + "epoch": 0.286204431736955, + "grad_norm": 0.9841408729553223, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.052, + "step": 10010 + }, + { + "epoch": 0.2864903502501787, + "grad_norm": 0.8658338785171509, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0461, + "step": 10020 + }, + { + "epoch": 0.28677626876340245, + "grad_norm": 0.624788224697113, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0416, + "step": 10030 + }, + { + "epoch": 0.28706218727662614, + "grad_norm": 0.6108028888702393, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0548, + "step": 10040 + }, + { + "epoch": 0.2873481057898499, + "grad_norm": 0.7907708883285522, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0406, + "step": 10050 + }, + { + "epoch": 0.2876340243030736, + "grad_norm": 0.7695413827896118, + "learning_rate": 1.60029690609047e-05, + "loss": 0.061, + "step": 10060 + }, + { + "epoch": 0.28791994281629735, + "grad_norm": 0.4407683312892914, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0483, + "step": 10070 + }, + { + "epoch": 0.2882058613295211, + "grad_norm": 0.6242743730545044, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.039, + "step": 10080 + }, + { + "epoch": 0.2884917798427448, + "grad_norm": 0.8752113580703735, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0433, + "step": 10090 + }, + { + "epoch": 0.28877769835596856, + "grad_norm": 0.8834511041641235, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0348, + "step": 10100 + }, + { + "epoch": 0.2890636168691923, + "grad_norm": 1.0036063194274902, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0593, + "step": 10110 + }, + { + "epoch": 0.28934953538241603, + "grad_norm": 0.5511205196380615, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0459, + "step": 10120 + }, + { + "epoch": 0.2896354538956397, + "grad_norm": 0.7717337012290955, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0381, + "step": 10130 + }, + { + "epoch": 0.28992137240886345, + "grad_norm": 1.123363971710205, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0462, + "step": 10140 + }, + { + "epoch": 0.2902072909220872, + "grad_norm": 0.6212007403373718, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0446, + "step": 10150 + }, + { + "epoch": 0.29049320943531093, + "grad_norm": 0.5547964572906494, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0362, + "step": 10160 + }, + { + "epoch": 0.29077912794853467, + "grad_norm": 0.593225359916687, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0462, + "step": 10170 + }, + { + "epoch": 0.2910650464617584, + "grad_norm": 0.5569560527801514, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0508, + "step": 10180 + }, + { + "epoch": 0.29135096497498214, + "grad_norm": 0.5464656949043274, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0399, + "step": 10190 + }, + { + "epoch": 0.2916368834882059, + "grad_norm": 1.2456778287887573, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0494, + "step": 10200 + }, + { + "epoch": 0.2919228020014296, + "grad_norm": 0.7862445712089539, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0551, + "step": 10210 + }, + { + "epoch": 0.2922087205146533, + "grad_norm": 0.745941698551178, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0469, + "step": 10220 + }, + { + "epoch": 0.29249463902787703, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0492, + "step": 10230 + }, + { + "epoch": 0.29278055754110077, + "grad_norm": 0.659205973148346, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0453, + "step": 10240 + }, + { + "epoch": 0.2930664760543245, + "grad_norm": 0.6925905346870422, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0463, + "step": 10250 + }, + { + "epoch": 0.29335239456754825, + "grad_norm": 0.479115754365921, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0395, + "step": 10260 + }, + { + "epoch": 0.293638313080772, + "grad_norm": 0.5085121393203735, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0504, + "step": 10270 + }, + { + "epoch": 0.2939242315939957, + "grad_norm": 0.46833914518356323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0411, + "step": 10280 + }, + { + "epoch": 0.29421015010721946, + "grad_norm": 0.4534672796726227, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0491, + "step": 10290 + }, + { + "epoch": 0.2944960686204432, + "grad_norm": 0.5704737305641174, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0391, + "step": 10300 + }, + { + "epoch": 0.2947819871336669, + "grad_norm": 1.0342676639556885, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0681, + "step": 10310 + }, + { + "epoch": 0.2950679056468906, + "grad_norm": 0.5002169013023376, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0429, + "step": 10320 + }, + { + "epoch": 0.29535382416011435, + "grad_norm": 0.5565863847732544, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0575, + "step": 10330 + }, + { + "epoch": 0.2956397426733381, + "grad_norm": 0.7826551198959351, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0448, + "step": 10340 + }, + { + "epoch": 0.29592566118656183, + "grad_norm": 0.7019012570381165, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0436, + "step": 10350 + }, + { + "epoch": 0.29621157969978557, + "grad_norm": 0.8324534893035889, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0503, + "step": 10360 + }, + { + "epoch": 0.2964974982130093, + "grad_norm": 0.7064073085784912, + "learning_rate": 1.574895332125391e-05, + "loss": 0.041, + "step": 10370 + }, + { + "epoch": 0.29678341672623304, + "grad_norm": 0.5634047389030457, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0474, + "step": 10380 + }, + { + "epoch": 0.2970693352394568, + "grad_norm": 0.8504926562309265, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0502, + "step": 10390 + }, + { + "epoch": 0.29735525375268046, + "grad_norm": 0.508313775062561, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0368, + "step": 10400 + }, + { + "epoch": 0.2976411722659042, + "grad_norm": 0.5851112008094788, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0472, + "step": 10410 + }, + { + "epoch": 0.29792709077912793, + "grad_norm": 0.5689557790756226, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0479, + "step": 10420 + }, + { + "epoch": 0.29821300929235167, + "grad_norm": 0.5026743412017822, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0406, + "step": 10430 + }, + { + "epoch": 0.2984989278055754, + "grad_norm": 0.5662751197814941, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0441, + "step": 10440 + }, + { + "epoch": 0.29878484631879915, + "grad_norm": 0.899709939956665, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.2990707648320229, + "grad_norm": 0.4681940972805023, + "learning_rate": 1.567419089313346e-05, + "loss": 0.054, + "step": 10460 + }, + { + "epoch": 0.2993566833452466, + "grad_norm": 0.39646071195602417, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0375, + "step": 10470 + }, + { + "epoch": 0.29964260185847036, + "grad_norm": 1.204815149307251, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0487, + "step": 10480 + }, + { + "epoch": 0.29992852037169404, + "grad_norm": 0.4507630467414856, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0516, + "step": 10490 + }, + { + "epoch": 0.3002144388849178, + "grad_norm": 0.9783321022987366, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0642, + "step": 10500 + }, + { + "epoch": 0.3005003573981415, + "grad_norm": 0.5406969785690308, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0447, + "step": 10510 + }, + { + "epoch": 0.30078627591136525, + "grad_norm": 0.44153860211372375, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0449, + "step": 10520 + }, + { + "epoch": 0.301072194424589, + "grad_norm": 0.5723687410354614, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0548, + "step": 10530 + }, + { + "epoch": 0.3013581129378127, + "grad_norm": 0.4453120529651642, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0434, + "step": 10540 + }, + { + "epoch": 0.30164403145103647, + "grad_norm": 0.34224697947502136, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0385, + "step": 10550 + }, + { + "epoch": 0.3019299499642602, + "grad_norm": 0.6389157176017761, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0569, + "step": 10560 + }, + { + "epoch": 0.30221586847748394, + "grad_norm": 0.5845953822135925, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0467, + "step": 10570 + }, + { + "epoch": 0.3025017869907076, + "grad_norm": 0.6581900119781494, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0422, + "step": 10580 + }, + { + "epoch": 0.30278770550393136, + "grad_norm": 0.4964161813259125, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0428, + "step": 10590 + }, + { + "epoch": 0.3030736240171551, + "grad_norm": 0.635380208492279, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0442, + "step": 10600 + }, + { + "epoch": 0.30335954253037883, + "grad_norm": 0.9795969128608704, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0517, + "step": 10610 + }, + { + "epoch": 0.30364546104360257, + "grad_norm": 0.9987231492996216, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0514, + "step": 10620 + }, + { + "epoch": 0.3039313795568263, + "grad_norm": 0.6384946703910828, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0471, + "step": 10630 + }, + { + "epoch": 0.30421729807005005, + "grad_norm": 0.49352115392684937, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0351, + "step": 10640 + }, + { + "epoch": 0.3045032165832738, + "grad_norm": 0.45028480887413025, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0438, + "step": 10650 + }, + { + "epoch": 0.3047891350964975, + "grad_norm": 0.5717794895172119, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0491, + "step": 10660 + }, + { + "epoch": 0.3050750536097212, + "grad_norm": 0.5436326265335083, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0407, + "step": 10670 + }, + { + "epoch": 0.30536097212294494, + "grad_norm": 0.7777692675590515, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0436, + "step": 10680 + }, + { + "epoch": 0.3056468906361687, + "grad_norm": 0.6597929000854492, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0407, + "step": 10690 + }, + { + "epoch": 0.3059328091493924, + "grad_norm": 0.6059311032295227, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0481, + "step": 10700 + }, + { + "epoch": 0.30621872766261615, + "grad_norm": 0.5530681014060974, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0418, + "step": 10710 + }, + { + "epoch": 0.3065046461758399, + "grad_norm": 0.5778716802597046, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0429, + "step": 10720 + }, + { + "epoch": 0.3067905646890636, + "grad_norm": 0.4573792517185211, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0586, + "step": 10730 + }, + { + "epoch": 0.30707648320228736, + "grad_norm": 0.8193615078926086, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0474, + "step": 10740 + }, + { + "epoch": 0.3073624017155111, + "grad_norm": 0.9410123229026794, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.3076483202287348, + "grad_norm": 0.8244432806968689, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0462, + "step": 10760 + }, + { + "epoch": 0.3079342387419585, + "grad_norm": 0.644899845123291, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0479, + "step": 10770 + }, + { + "epoch": 0.30822015725518226, + "grad_norm": 0.28044867515563965, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.04, + "step": 10780 + }, + { + "epoch": 0.308506075768406, + "grad_norm": 0.6538394093513489, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0406, + "step": 10790 + }, + { + "epoch": 0.30879199428162973, + "grad_norm": 0.9572822451591492, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0505, + "step": 10800 + }, + { + "epoch": 0.30907791279485347, + "grad_norm": 0.539826512336731, + "learning_rate": 1.537928999540189e-05, + "loss": 0.05, + "step": 10810 + }, + { + "epoch": 0.3093638313080772, + "grad_norm": 0.801988959312439, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0454, + "step": 10820 + }, + { + "epoch": 0.30964974982130095, + "grad_norm": 0.57478928565979, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.039, + "step": 10830 + }, + { + "epoch": 0.3099356683345247, + "grad_norm": 0.6313017010688782, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0384, + "step": 10840 + }, + { + "epoch": 0.31022158684774837, + "grad_norm": 0.507997989654541, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0365, + "step": 10850 + }, + { + "epoch": 0.3105075053609721, + "grad_norm": 0.5152313709259033, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0487, + "step": 10860 + }, + { + "epoch": 0.31079342387419584, + "grad_norm": 0.6123478412628174, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0405, + "step": 10870 + }, + { + "epoch": 0.3110793423874196, + "grad_norm": 1.079551100730896, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0443, + "step": 10880 + }, + { + "epoch": 0.3113652609006433, + "grad_norm": 0.39866960048675537, + "learning_rate": 1.531098472380285e-05, + "loss": 0.04, + "step": 10890 + }, + { + "epoch": 0.31165117941386705, + "grad_norm": 0.3715427815914154, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0387, + "step": 10900 + }, + { + "epoch": 0.3119370979270908, + "grad_norm": 0.7201068997383118, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.054, + "step": 10910 + }, + { + "epoch": 0.3122230164403145, + "grad_norm": 0.9512631893157959, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0383, + "step": 10920 + }, + { + "epoch": 0.31250893495353826, + "grad_norm": 0.5948206186294556, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0472, + "step": 10930 + }, + { + "epoch": 0.31279485346676195, + "grad_norm": 0.7174249291419983, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0437, + "step": 10940 + }, + { + "epoch": 0.3130807719799857, + "grad_norm": 0.6190982460975647, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0383, + "step": 10950 + }, + { + "epoch": 0.3133666904932094, + "grad_norm": 0.7733815312385559, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0327, + "step": 10960 + }, + { + "epoch": 0.31365260900643316, + "grad_norm": 1.2995271682739258, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0427, + "step": 10970 + }, + { + "epoch": 0.3139385275196569, + "grad_norm": 1.1102336645126343, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.04, + "step": 10980 + }, + { + "epoch": 0.31422444603288063, + "grad_norm": 0.7618277668952942, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0488, + "step": 10990 + }, + { + "epoch": 0.31451036454610437, + "grad_norm": 0.5355142951011658, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0436, + "step": 11000 + }, + { + "epoch": 0.3147962830593281, + "grad_norm": 1.3410072326660156, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0463, + "step": 11010 + }, + { + "epoch": 0.31508220157255185, + "grad_norm": 0.7810450196266174, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0493, + "step": 11020 + }, + { + "epoch": 0.3153681200857755, + "grad_norm": 0.6452206373214722, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0354, + "step": 11030 + }, + { + "epoch": 0.31565403859899926, + "grad_norm": 1.037593126296997, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0418, + "step": 11040 + }, + { + "epoch": 0.315939957112223, + "grad_norm": 0.7032834887504578, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0431, + "step": 11050 + }, + { + "epoch": 0.31622587562544674, + "grad_norm": 0.5168939232826233, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0472, + "step": 11060 + }, + { + "epoch": 0.3165117941386705, + "grad_norm": 0.5239925384521484, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0438, + "step": 11070 + }, + { + "epoch": 0.3167977126518942, + "grad_norm": 0.8209654688835144, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0506, + "step": 11080 + }, + { + "epoch": 0.31708363116511795, + "grad_norm": 0.5318232178688049, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0516, + "step": 11090 + }, + { + "epoch": 0.3173695496783417, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0482, + "step": 11100 + }, + { + "epoch": 0.3176554681915654, + "grad_norm": 0.6691215634346008, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.046, + "step": 11110 + }, + { + "epoch": 0.3179413867047891, + "grad_norm": 0.4862753450870514, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 0.31822730521801285, + "grad_norm": 0.4640316963195801, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0433, + "step": 11130 + }, + { + "epoch": 0.3185132237312366, + "grad_norm": 0.7841521501541138, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0445, + "step": 11140 + }, + { + "epoch": 0.3187991422444603, + "grad_norm": 0.6809426546096802, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0518, + "step": 11150 + }, + { + "epoch": 0.31908506075768406, + "grad_norm": 0.6195946931838989, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0569, + "step": 11160 + }, + { + "epoch": 0.3193709792709078, + "grad_norm": 0.7289860248565674, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0487, + "step": 11170 + }, + { + "epoch": 0.31965689778413153, + "grad_norm": 0.5575736165046692, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0409, + "step": 11180 + }, + { + "epoch": 0.31994281629735527, + "grad_norm": 0.8619267344474792, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0424, + "step": 11190 + }, + { + "epoch": 0.320228734810579, + "grad_norm": 0.740242063999176, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0474, + "step": 11200 + }, + { + "epoch": 0.3205146533238027, + "grad_norm": 0.4169894754886627, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0395, + "step": 11210 + }, + { + "epoch": 0.3208005718370264, + "grad_norm": 0.5773794651031494, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0414, + "step": 11220 + }, + { + "epoch": 0.32108649035025016, + "grad_norm": 0.4941500723361969, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0484, + "step": 11230 + }, + { + "epoch": 0.3213724088634739, + "grad_norm": 0.7985579371452332, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.051, + "step": 11240 + }, + { + "epoch": 0.32165832737669764, + "grad_norm": 0.5262066721916199, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0434, + "step": 11250 + }, + { + "epoch": 0.3219442458899214, + "grad_norm": 0.4074312150478363, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0428, + "step": 11260 + }, + { + "epoch": 0.3222301644031451, + "grad_norm": 1.0757715702056885, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0468, + "step": 11270 + }, + { + "epoch": 0.32251608291636885, + "grad_norm": 0.7281575202941895, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0386, + "step": 11280 + }, + { + "epoch": 0.3228020014295926, + "grad_norm": 0.35078516602516174, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0413, + "step": 11290 + }, + { + "epoch": 0.32308791994281627, + "grad_norm": 0.5642452836036682, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0435, + "step": 11300 + }, + { + "epoch": 0.32337383845604, + "grad_norm": 0.5326974987983704, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0459, + "step": 11310 + }, + { + "epoch": 0.32365975696926375, + "grad_norm": 0.6212049126625061, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0451, + "step": 11320 + }, + { + "epoch": 0.3239456754824875, + "grad_norm": 0.4887222349643707, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0445, + "step": 11330 + }, + { + "epoch": 0.3242315939957112, + "grad_norm": 0.6692403554916382, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0423, + "step": 11340 + }, + { + "epoch": 0.32451751250893496, + "grad_norm": 0.7166061997413635, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0445, + "step": 11350 + }, + { + "epoch": 0.3248034310221587, + "grad_norm": 0.5342463850975037, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0394, + "step": 11360 + }, + { + "epoch": 0.32508934953538243, + "grad_norm": 1.0617904663085938, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0401, + "step": 11370 + }, + { + "epoch": 0.32537526804860617, + "grad_norm": 0.9869458675384521, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0508, + "step": 11380 + }, + { + "epoch": 0.32566118656182985, + "grad_norm": 0.32021698355674744, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0346, + "step": 11390 + }, + { + "epoch": 0.3259471050750536, + "grad_norm": 0.6566154360771179, + "learning_rate": 1.486814531655139e-05, + "loss": 0.046, + "step": 11400 + }, + { + "epoch": 0.3262330235882773, + "grad_norm": 0.6716777086257935, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.045, + "step": 11410 + }, + { + "epoch": 0.32651894210150106, + "grad_norm": 0.7489042282104492, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0443, + "step": 11420 + }, + { + "epoch": 0.3268048606147248, + "grad_norm": 0.6040313243865967, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0418, + "step": 11430 + }, + { + "epoch": 0.32709077912794854, + "grad_norm": 0.4891999363899231, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0342, + "step": 11440 + }, + { + "epoch": 0.3273766976411723, + "grad_norm": 0.4264339506626129, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0414, + "step": 11450 + }, + { + "epoch": 0.327662616154396, + "grad_norm": 0.5535606741905212, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0362, + "step": 11460 + }, + { + "epoch": 0.32794853466761975, + "grad_norm": 0.566705048084259, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0472, + "step": 11470 + }, + { + "epoch": 0.32823445318084343, + "grad_norm": 0.8539089560508728, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0478, + "step": 11480 + }, + { + "epoch": 0.32852037169406717, + "grad_norm": 0.3981179893016815, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0429, + "step": 11490 + }, + { + "epoch": 0.3288062902072909, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0487, + "step": 11500 + }, + { + "epoch": 0.32909220872051465, + "grad_norm": 0.45551198720932007, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0384, + "step": 11510 + }, + { + "epoch": 0.3293781272337384, + "grad_norm": 0.6321517825126648, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0541, + "step": 11520 + }, + { + "epoch": 0.3296640457469621, + "grad_norm": 0.7971932888031006, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0445, + "step": 11530 + }, + { + "epoch": 0.32994996426018586, + "grad_norm": 0.5022657513618469, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0414, + "step": 11540 + }, + { + "epoch": 0.3302358827734096, + "grad_norm": 0.7302954196929932, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.044, + "step": 11550 + }, + { + "epoch": 0.33052180128663333, + "grad_norm": 0.5123834013938904, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0451, + "step": 11560 + }, + { + "epoch": 0.330807719799857, + "grad_norm": 0.5261625647544861, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0416, + "step": 11570 + }, + { + "epoch": 0.33109363831308075, + "grad_norm": 0.5782840251922607, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0419, + "step": 11580 + }, + { + "epoch": 0.3313795568263045, + "grad_norm": 0.9754800796508789, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0403, + "step": 11590 + }, + { + "epoch": 0.3316654753395282, + "grad_norm": 0.48157551884651184, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0459, + "step": 11600 + }, + { + "epoch": 0.33195139385275196, + "grad_norm": 0.4394964277744293, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0461, + "step": 11610 + }, + { + "epoch": 0.3322373123659757, + "grad_norm": 1.220790147781372, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0448, + "step": 11620 + }, + { + "epoch": 0.33252323087919944, + "grad_norm": 0.6908231973648071, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0431, + "step": 11630 + }, + { + "epoch": 0.3328091493924232, + "grad_norm": 0.45382779836654663, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0379, + "step": 11640 + }, + { + "epoch": 0.3330950679056469, + "grad_norm": 0.5963619947433472, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0465, + "step": 11650 + }, + { + "epoch": 0.3333809864188706, + "grad_norm": 0.676210880279541, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0411, + "step": 11660 + }, + { + "epoch": 0.33366690493209433, + "grad_norm": 0.893473744392395, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0443, + "step": 11670 + }, + { + "epoch": 0.33395282344531807, + "grad_norm": 0.30655553936958313, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.04, + "step": 11680 + }, + { + "epoch": 0.3342387419585418, + "grad_norm": 0.899615466594696, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0462, + "step": 11690 + }, + { + "epoch": 0.33452466047176554, + "grad_norm": 0.5037568807601929, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0394, + "step": 11700 + }, + { + "epoch": 0.3348105789849893, + "grad_norm": 0.573716402053833, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0426, + "step": 11710 + }, + { + "epoch": 0.335096497498213, + "grad_norm": 0.4985221326351166, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0422, + "step": 11720 + }, + { + "epoch": 0.33538241601143676, + "grad_norm": 0.8864797353744507, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0504, + "step": 11730 + }, + { + "epoch": 0.3356683345246605, + "grad_norm": 0.49209004640579224, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0409, + "step": 11740 + }, + { + "epoch": 0.3359542530378842, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0468, + "step": 11750 + }, + { + "epoch": 0.3362401715511079, + "grad_norm": 0.7552497386932373, + "learning_rate": 1.454836451908656e-05, + "loss": 0.041, + "step": 11760 + }, + { + "epoch": 0.33652609006433165, + "grad_norm": 0.5737242102622986, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0503, + "step": 11770 + }, + { + "epoch": 0.3368120085775554, + "grad_norm": 0.46150341629981995, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0399, + "step": 11780 + }, + { + "epoch": 0.3370979270907791, + "grad_norm": 0.55389803647995, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0442, + "step": 11790 + }, + { + "epoch": 0.33738384560400286, + "grad_norm": 0.7647727727890015, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0472, + "step": 11800 + }, + { + "epoch": 0.3376697641172266, + "grad_norm": 0.8755397200584412, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0444, + "step": 11810 + }, + { + "epoch": 0.33795568263045034, + "grad_norm": 0.9257917404174805, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0416, + "step": 11820 + }, + { + "epoch": 0.3382416011436741, + "grad_norm": 0.4048840403556824, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0418, + "step": 11830 + }, + { + "epoch": 0.33852751965689776, + "grad_norm": 0.584200382232666, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0436, + "step": 11840 + }, + { + "epoch": 0.3388134381701215, + "grad_norm": 0.7565616369247437, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0407, + "step": 11850 + }, + { + "epoch": 0.33909935668334523, + "grad_norm": 0.8025793433189392, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0424, + "step": 11860 + }, + { + "epoch": 0.33938527519656897, + "grad_norm": 0.3123756945133209, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.044, + "step": 11870 + }, + { + "epoch": 0.3396711937097927, + "grad_norm": 0.8047941327095032, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0471, + "step": 11880 + }, + { + "epoch": 0.33995711222301644, + "grad_norm": 0.8675779104232788, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0443, + "step": 11890 + }, + { + "epoch": 0.3402430307362402, + "grad_norm": 0.47229406237602234, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0416, + "step": 11900 + }, + { + "epoch": 0.3405289492494639, + "grad_norm": 0.3775595426559448, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0512, + "step": 11910 + }, + { + "epoch": 0.34081486776268766, + "grad_norm": 0.6179372668266296, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0395, + "step": 11920 + }, + { + "epoch": 0.34110078627591134, + "grad_norm": 0.47618359327316284, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0407, + "step": 11930 + }, + { + "epoch": 0.3413867047891351, + "grad_norm": 0.5495609641075134, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.041, + "step": 11940 + }, + { + "epoch": 0.3416726233023588, + "grad_norm": 0.7276089191436768, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0445, + "step": 11950 + }, + { + "epoch": 0.34195854181558255, + "grad_norm": 0.9464111328125, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0471, + "step": 11960 + }, + { + "epoch": 0.3422444603288063, + "grad_norm": 0.8340250253677368, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0488, + "step": 11970 + }, + { + "epoch": 0.34253037884203, + "grad_norm": 0.6392719149589539, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0407, + "step": 11980 + }, + { + "epoch": 0.34281629735525376, + "grad_norm": 0.7563493251800537, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0388, + "step": 11990 + }, + { + "epoch": 0.3431022158684775, + "grad_norm": 0.7145271301269531, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.042, + "step": 12000 + }, + { + "epoch": 0.34338813438170124, + "grad_norm": 0.6522033214569092, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0507, + "step": 12010 + }, + { + "epoch": 0.3436740528949249, + "grad_norm": 0.4634755849838257, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0388, + "step": 12020 + }, + { + "epoch": 0.34395997140814866, + "grad_norm": 0.6681762337684631, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0498, + "step": 12030 + }, + { + "epoch": 0.3442458899213724, + "grad_norm": 0.5068351626396179, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0484, + "step": 12040 + }, + { + "epoch": 0.34453180843459613, + "grad_norm": 0.5424943566322327, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0406, + "step": 12050 + }, + { + "epoch": 0.34481772694781987, + "grad_norm": 0.674436628818512, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.04, + "step": 12060 + }, + { + "epoch": 0.3451036454610436, + "grad_norm": 0.8140727281570435, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0417, + "step": 12070 + }, + { + "epoch": 0.34538956397426734, + "grad_norm": 0.6394575238227844, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0413, + "step": 12080 + }, + { + "epoch": 0.3456754824874911, + "grad_norm": 0.5134334564208984, + "learning_rate": 1.425047976058418e-05, + "loss": 0.04, + "step": 12090 + }, + { + "epoch": 0.3459614010007148, + "grad_norm": 0.6670883297920227, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0397, + "step": 12100 + }, + { + "epoch": 0.3462473195139385, + "grad_norm": 0.49804338812828064, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0431, + "step": 12110 + }, + { + "epoch": 0.34653323802716224, + "grad_norm": 0.33912673592567444, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0492, + "step": 12120 + }, + { + "epoch": 0.346819156540386, + "grad_norm": 0.45478618144989014, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0427, + "step": 12130 + }, + { + "epoch": 0.3471050750536097, + "grad_norm": 0.6690845489501953, + "learning_rate": 1.420497389129506e-05, + "loss": 0.044, + "step": 12140 + }, + { + "epoch": 0.34739099356683345, + "grad_norm": 0.9296556115150452, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.042, + "step": 12150 + }, + { + "epoch": 0.3476769120800572, + "grad_norm": 0.4859760105609894, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0386, + "step": 12160 + }, + { + "epoch": 0.3479628305932809, + "grad_norm": 1.0067541599273682, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0495, + "step": 12170 + }, + { + "epoch": 0.34824874910650466, + "grad_norm": 0.7799471616744995, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0614, + "step": 12180 + }, + { + "epoch": 0.3485346676197284, + "grad_norm": 0.48603832721710205, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0422, + "step": 12190 + }, + { + "epoch": 0.3488205861329521, + "grad_norm": 1.2030225992202759, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0535, + "step": 12200 + }, + { + "epoch": 0.3491065046461758, + "grad_norm": 0.5523782968521118, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0437, + "step": 12210 + }, + { + "epoch": 0.34939242315939956, + "grad_norm": 0.9041968584060669, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0441, + "step": 12220 + }, + { + "epoch": 0.3496783416726233, + "grad_norm": 0.5859020948410034, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0451, + "step": 12230 + }, + { + "epoch": 0.34996426018584703, + "grad_norm": 0.8736525177955627, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0439, + "step": 12240 + }, + { + "epoch": 0.35025017869907077, + "grad_norm": 0.4692678153514862, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0516, + "step": 12250 + }, + { + "epoch": 0.3505360972122945, + "grad_norm": 0.6326560974121094, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0427, + "step": 12260 + }, + { + "epoch": 0.35082201572551824, + "grad_norm": 0.6265914440155029, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0392, + "step": 12270 + }, + { + "epoch": 0.351107934238742, + "grad_norm": 0.8684681057929993, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0416, + "step": 12280 + }, + { + "epoch": 0.35139385275196566, + "grad_norm": 0.6076116561889648, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0405, + "step": 12290 + }, + { + "epoch": 0.3516797712651894, + "grad_norm": 0.36192813515663147, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0417, + "step": 12300 + }, + { + "epoch": 0.35196568977841314, + "grad_norm": 0.5561486482620239, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0397, + "step": 12310 + }, + { + "epoch": 0.3522516082916369, + "grad_norm": 0.5955346822738647, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0332, + "step": 12320 + }, + { + "epoch": 0.3525375268048606, + "grad_norm": 0.4861294627189636, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0423, + "step": 12330 + }, + { + "epoch": 0.35282344531808435, + "grad_norm": 0.920704185962677, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0467, + "step": 12340 + }, + { + "epoch": 0.3531093638313081, + "grad_norm": 0.4749159514904022, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0425, + "step": 12350 + }, + { + "epoch": 0.3533952823445318, + "grad_norm": 0.5075432658195496, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0362, + "step": 12360 + }, + { + "epoch": 0.35368120085775556, + "grad_norm": 0.3057022988796234, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0378, + "step": 12370 + }, + { + "epoch": 0.35396711937097924, + "grad_norm": 0.48122167587280273, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0359, + "step": 12380 + }, + { + "epoch": 0.354253037884203, + "grad_norm": 0.39227673411369324, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0432, + "step": 12390 + }, + { + "epoch": 0.3545389563974267, + "grad_norm": 0.641839861869812, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0422, + "step": 12400 + }, + { + "epoch": 0.35482487491065046, + "grad_norm": 1.0422887802124023, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0445, + "step": 12410 + }, + { + "epoch": 0.3551107934238742, + "grad_norm": 0.5336428880691528, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0408, + "step": 12420 + }, + { + "epoch": 0.35539671193709793, + "grad_norm": 0.6634368896484375, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0374, + "step": 12430 + }, + { + "epoch": 0.35568263045032167, + "grad_norm": 0.5840758085250854, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0417, + "step": 12440 + }, + { + "epoch": 0.3559685489635454, + "grad_norm": 0.8465530872344971, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0449, + "step": 12450 + }, + { + "epoch": 0.35625446747676914, + "grad_norm": 0.48737838864326477, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0439, + "step": 12460 + }, + { + "epoch": 0.3565403859899928, + "grad_norm": 1.2267687320709229, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0395, + "step": 12470 + }, + { + "epoch": 0.35682630450321656, + "grad_norm": 0.4097842276096344, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0379, + "step": 12480 + }, + { + "epoch": 0.3571122230164403, + "grad_norm": 0.8895343542098999, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0415, + "step": 12490 + }, + { + "epoch": 0.35739814152966404, + "grad_norm": 0.6732933521270752, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0432, + "step": 12500 + }, + { + "epoch": 0.3576840600428878, + "grad_norm": 0.4521937966346741, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0442, + "step": 12510 + }, + { + "epoch": 0.3579699785561115, + "grad_norm": 0.5932701826095581, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0407, + "step": 12520 + }, + { + "epoch": 0.35825589706933525, + "grad_norm": 0.5595138669013977, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0387, + "step": 12530 + }, + { + "epoch": 0.358541815582559, + "grad_norm": 0.7205538153648376, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0393, + "step": 12540 + }, + { + "epoch": 0.3588277340957827, + "grad_norm": 0.4069580137729645, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0554, + "step": 12550 + }, + { + "epoch": 0.3591136526090064, + "grad_norm": 0.4881740212440491, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0411, + "step": 12560 + }, + { + "epoch": 0.35939957112223014, + "grad_norm": 0.7710328102111816, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.043, + "step": 12570 + }, + { + "epoch": 0.3596854896354539, + "grad_norm": 0.6593908071517944, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.046, + "step": 12580 + }, + { + "epoch": 0.3599714081486776, + "grad_norm": 0.6712149977684021, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0392, + "step": 12590 + }, + { + "epoch": 0.36025732666190136, + "grad_norm": 0.6103658080101013, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0482, + "step": 12600 + }, + { + "epoch": 0.3605432451751251, + "grad_norm": 0.5170528292655945, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0441, + "step": 12610 + }, + { + "epoch": 0.36082916368834883, + "grad_norm": 0.47434374690055847, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0436, + "step": 12620 + }, + { + "epoch": 0.36111508220157257, + "grad_norm": 0.6546452045440674, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0441, + "step": 12630 + }, + { + "epoch": 0.3614010007147963, + "grad_norm": 1.3334686756134033, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0464, + "step": 12640 + }, + { + "epoch": 0.36168691922802, + "grad_norm": 1.3882309198379517, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0527, + "step": 12650 + }, + { + "epoch": 0.3619728377412437, + "grad_norm": 0.829872190952301, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0499, + "step": 12660 + }, + { + "epoch": 0.36225875625446746, + "grad_norm": 0.6917227506637573, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0513, + "step": 12670 + }, + { + "epoch": 0.3625446747676912, + "grad_norm": 0.3825722634792328, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0455, + "step": 12680 + }, + { + "epoch": 0.36283059328091494, + "grad_norm": 0.7726976275444031, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0366, + "step": 12690 + }, + { + "epoch": 0.3631165117941387, + "grad_norm": 0.48851099610328674, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0363, + "step": 12700 + }, + { + "epoch": 0.3634024303073624, + "grad_norm": 0.5034362077713013, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0461, + "step": 12710 + }, + { + "epoch": 0.36368834882058615, + "grad_norm": 0.8411096334457397, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0448, + "step": 12720 + }, + { + "epoch": 0.3639742673338099, + "grad_norm": 0.7185337543487549, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0366, + "step": 12730 + }, + { + "epoch": 0.36426018584703357, + "grad_norm": 0.5850857496261597, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0414, + "step": 12740 + }, + { + "epoch": 0.3645461043602573, + "grad_norm": 0.47304606437683105, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0464, + "step": 12750 + }, + { + "epoch": 0.36483202287348104, + "grad_norm": 0.7190109491348267, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0418, + "step": 12760 + }, + { + "epoch": 0.3651179413867048, + "grad_norm": 0.8053406476974487, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0407, + "step": 12770 + }, + { + "epoch": 0.3654038598999285, + "grad_norm": 0.8875076174736023, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0471, + "step": 12780 + }, + { + "epoch": 0.36568977841315226, + "grad_norm": 0.5206999182701111, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0478, + "step": 12790 + }, + { + "epoch": 0.365975696926376, + "grad_norm": 0.5034269690513611, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0422, + "step": 12800 + }, + { + "epoch": 0.36626161543959973, + "grad_norm": 0.9846853017807007, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.045, + "step": 12810 + }, + { + "epoch": 0.36654753395282347, + "grad_norm": 0.49341151118278503, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0471, + "step": 12820 + }, + { + "epoch": 0.36683345246604715, + "grad_norm": 0.765583336353302, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0411, + "step": 12830 + }, + { + "epoch": 0.3671193709792709, + "grad_norm": 0.5193378925323486, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0522, + "step": 12840 + }, + { + "epoch": 0.3674052894924946, + "grad_norm": 0.8142374157905579, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0374, + "step": 12850 + }, + { + "epoch": 0.36769120800571836, + "grad_norm": 0.7233540415763855, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0516, + "step": 12860 + }, + { + "epoch": 0.3679771265189421, + "grad_norm": 0.38758793473243713, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0437, + "step": 12870 + }, + { + "epoch": 0.36826304503216584, + "grad_norm": 0.36923956871032715, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.041, + "step": 12880 + }, + { + "epoch": 0.3685489635453896, + "grad_norm": 1.0518147945404053, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0446, + "step": 12890 + }, + { + "epoch": 0.3688348820586133, + "grad_norm": 0.5833591818809509, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0362, + "step": 12900 + }, + { + "epoch": 0.36912080057183705, + "grad_norm": 0.6178849339485168, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.041, + "step": 12910 + }, + { + "epoch": 0.36940671908506073, + "grad_norm": 0.7599044442176819, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0473, + "step": 12920 + }, + { + "epoch": 0.36969263759828447, + "grad_norm": 0.7787651419639587, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0461, + "step": 12930 + }, + { + "epoch": 0.3699785561115082, + "grad_norm": 0.3847586512565613, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0413, + "step": 12940 + }, + { + "epoch": 0.37026447462473194, + "grad_norm": 0.6218805313110352, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0424, + "step": 12950 + }, + { + "epoch": 0.3705503931379557, + "grad_norm": 0.6770363450050354, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0426, + "step": 12960 + }, + { + "epoch": 0.3708363116511794, + "grad_norm": 0.6817107796669006, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.041, + "step": 12970 + }, + { + "epoch": 0.37112223016440316, + "grad_norm": 1.6997944116592407, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0626, + "step": 12980 + }, + { + "epoch": 0.3714081486776269, + "grad_norm": 0.4540708363056183, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0356, + "step": 12990 + }, + { + "epoch": 0.37169406719085063, + "grad_norm": 0.4272336959838867, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0354, + "step": 13000 + }, + { + "epoch": 0.3719799857040743, + "grad_norm": 0.4723891019821167, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0438, + "step": 13010 + }, + { + "epoch": 0.37226590421729805, + "grad_norm": 0.5508099794387817, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.042, + "step": 13020 + }, + { + "epoch": 0.3725518227305218, + "grad_norm": 1.05836021900177, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0472, + "step": 13030 + }, + { + "epoch": 0.3728377412437455, + "grad_norm": 0.4397801458835602, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0462, + "step": 13040 + }, + { + "epoch": 0.37312365975696926, + "grad_norm": 0.3131158649921417, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0383, + "step": 13050 + }, + { + "epoch": 0.373409578270193, + "grad_norm": 0.5489990711212158, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0398, + "step": 13060 + }, + { + "epoch": 0.37369549678341674, + "grad_norm": 0.7425751686096191, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0416, + "step": 13070 + }, + { + "epoch": 0.3739814152966405, + "grad_norm": 0.6337125301361084, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0387, + "step": 13080 + }, + { + "epoch": 0.3742673338098642, + "grad_norm": 0.656467854976654, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0431, + "step": 13090 + }, + { + "epoch": 0.3745532523230879, + "grad_norm": 0.7011964321136475, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0487, + "step": 13100 + }, + { + "epoch": 0.37483917083631163, + "grad_norm": 0.4949609041213989, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0429, + "step": 13110 + }, + { + "epoch": 0.37512508934953537, + "grad_norm": 0.6796516180038452, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0405, + "step": 13120 + }, + { + "epoch": 0.3754110078627591, + "grad_norm": 0.41161492466926575, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0359, + "step": 13130 + }, + { + "epoch": 0.37569692637598284, + "grad_norm": 0.4463254511356354, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0353, + "step": 13140 + }, + { + "epoch": 0.3759828448892066, + "grad_norm": 0.4082377254962921, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.047, + "step": 13150 + }, + { + "epoch": 0.3762687634024303, + "grad_norm": 0.7927104830741882, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0484, + "step": 13160 + }, + { + "epoch": 0.37655468191565405, + "grad_norm": 0.5212385058403015, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.041, + "step": 13170 + }, + { + "epoch": 0.3768406004288778, + "grad_norm": 0.7408128380775452, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0462, + "step": 13180 + }, + { + "epoch": 0.3771265189421015, + "grad_norm": 0.3847906291484833, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0361, + "step": 13190 + }, + { + "epoch": 0.3774124374553252, + "grad_norm": 0.5039756298065186, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0385, + "step": 13200 + }, + { + "epoch": 0.37769835596854895, + "grad_norm": 0.5682945251464844, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0369, + "step": 13210 + }, + { + "epoch": 0.3779842744817727, + "grad_norm": 0.5985261797904968, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0376, + "step": 13220 + }, + { + "epoch": 0.3782701929949964, + "grad_norm": 0.7080312967300415, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0514, + "step": 13230 + }, + { + "epoch": 0.37855611150822016, + "grad_norm": 0.7488406300544739, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0421, + "step": 13240 + }, + { + "epoch": 0.3788420300214439, + "grad_norm": 0.38066044449806213, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0411, + "step": 13250 + }, + { + "epoch": 0.37912794853466764, + "grad_norm": 0.6335283517837524, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0526, + "step": 13260 + }, + { + "epoch": 0.3794138670478914, + "grad_norm": 0.7008160352706909, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0402, + "step": 13270 + }, + { + "epoch": 0.37969978556111506, + "grad_norm": 0.4219777286052704, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.039, + "step": 13280 + }, + { + "epoch": 0.3799857040743388, + "grad_norm": 0.6447705030441284, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0412, + "step": 13290 + }, + { + "epoch": 0.38027162258756253, + "grad_norm": 0.4625374674797058, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0437, + "step": 13300 + }, + { + "epoch": 0.38055754110078627, + "grad_norm": 0.4056257903575897, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0377, + "step": 13310 + }, + { + "epoch": 0.38084345961401, + "grad_norm": 0.425281286239624, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0378, + "step": 13320 + }, + { + "epoch": 0.38112937812723374, + "grad_norm": 0.4031837582588196, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0361, + "step": 13330 + }, + { + "epoch": 0.3814152966404575, + "grad_norm": 0.469175785779953, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0391, + "step": 13340 + }, + { + "epoch": 0.3817012151536812, + "grad_norm": 0.36555227637290955, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0352, + "step": 13350 + }, + { + "epoch": 0.38198713366690495, + "grad_norm": 0.8802763819694519, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0412, + "step": 13360 + }, + { + "epoch": 0.38227305218012864, + "grad_norm": 0.5733079314231873, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0418, + "step": 13370 + }, + { + "epoch": 0.3825589706933524, + "grad_norm": 0.606238842010498, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0518, + "step": 13380 + }, + { + "epoch": 0.3828448892065761, + "grad_norm": 0.5096673369407654, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0404, + "step": 13390 + }, + { + "epoch": 0.38313080771979985, + "grad_norm": 0.8240867853164673, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0513, + "step": 13400 + }, + { + "epoch": 0.3834167262330236, + "grad_norm": 0.3757685422897339, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0407, + "step": 13410 + }, + { + "epoch": 0.3837026447462473, + "grad_norm": 0.4560941755771637, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0429, + "step": 13420 + }, + { + "epoch": 0.38398856325947106, + "grad_norm": 0.42831951379776, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0387, + "step": 13430 + }, + { + "epoch": 0.3842744817726948, + "grad_norm": 0.8373785614967346, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0473, + "step": 13440 + }, + { + "epoch": 0.38456040028591854, + "grad_norm": 0.9560670256614685, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0442, + "step": 13450 + }, + { + "epoch": 0.3848463187991422, + "grad_norm": 0.4101570248603821, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0429, + "step": 13460 + }, + { + "epoch": 0.38513223731236595, + "grad_norm": 0.673739492893219, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0525, + "step": 13470 + }, + { + "epoch": 0.3854181558255897, + "grad_norm": 1.126909852027893, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0499, + "step": 13480 + }, + { + "epoch": 0.38570407433881343, + "grad_norm": 0.571437656879425, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0431, + "step": 13490 + }, + { + "epoch": 0.38598999285203717, + "grad_norm": 0.5121229887008667, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0419, + "step": 13500 + }, + { + "epoch": 0.3862759113652609, + "grad_norm": 0.6143786907196045, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0373, + "step": 13510 + }, + { + "epoch": 0.38656182987848464, + "grad_norm": 0.395014226436615, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0457, + "step": 13520 + }, + { + "epoch": 0.3868477483917084, + "grad_norm": 0.46027693152427673, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0372, + "step": 13530 + }, + { + "epoch": 0.3871336669049321, + "grad_norm": 0.42744559049606323, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0417, + "step": 13540 + }, + { + "epoch": 0.3874195854181558, + "grad_norm": 0.4765837490558624, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0442, + "step": 13550 + }, + { + "epoch": 0.38770550393137954, + "grad_norm": 0.9767054319381714, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0397, + "step": 13560 + }, + { + "epoch": 0.3879914224446033, + "grad_norm": 0.5535935759544373, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0423, + "step": 13570 + }, + { + "epoch": 0.388277340957827, + "grad_norm": 0.3802829384803772, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0329, + "step": 13580 + }, + { + "epoch": 0.38856325947105075, + "grad_norm": 0.6564178466796875, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0423, + "step": 13590 + }, + { + "epoch": 0.3888491779842745, + "grad_norm": 0.4400223195552826, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0356, + "step": 13600 + }, + { + "epoch": 0.3891350964974982, + "grad_norm": 0.4441612958908081, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0576, + "step": 13610 + }, + { + "epoch": 0.38942101501072196, + "grad_norm": 0.5270922780036926, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0406, + "step": 13620 + }, + { + "epoch": 0.3897069335239457, + "grad_norm": 0.6497722268104553, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0389, + "step": 13630 + }, + { + "epoch": 0.3899928520371694, + "grad_norm": 0.628182053565979, + "learning_rate": 1.280216624157504e-05, + "loss": 0.049, + "step": 13640 + }, + { + "epoch": 0.3902787705503931, + "grad_norm": 0.5242640376091003, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0389, + "step": 13650 + }, + { + "epoch": 0.39056468906361685, + "grad_norm": 0.5140895843505859, + "learning_rate": 1.278305741539386e-05, + "loss": 0.047, + "step": 13660 + }, + { + "epoch": 0.3908506075768406, + "grad_norm": 0.531012773513794, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0415, + "step": 13670 + }, + { + "epoch": 0.39113652609006433, + "grad_norm": 0.5066007375717163, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0411, + "step": 13680 + }, + { + "epoch": 0.39142244460328807, + "grad_norm": 1.0783177614212036, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0371, + "step": 13690 + }, + { + "epoch": 0.3917083631165118, + "grad_norm": 0.592755913734436, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0402, + "step": 13700 + }, + { + "epoch": 0.39199428162973554, + "grad_norm": 0.5595790147781372, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0543, + "step": 13710 + }, + { + "epoch": 0.3922802001429593, + "grad_norm": 0.5388237237930298, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0487, + "step": 13720 + }, + { + "epoch": 0.39256611865618296, + "grad_norm": 0.5311065316200256, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0389, + "step": 13730 + }, + { + "epoch": 0.3928520371694067, + "grad_norm": 0.8037494421005249, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0456, + "step": 13740 + }, + { + "epoch": 0.39313795568263044, + "grad_norm": 0.851921796798706, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0389, + "step": 13750 + }, + { + "epoch": 0.3934238741958542, + "grad_norm": 0.5924596190452576, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0401, + "step": 13760 + }, + { + "epoch": 0.3937097927090779, + "grad_norm": 0.5660725831985474, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0443, + "step": 13770 + }, + { + "epoch": 0.39399571122230165, + "grad_norm": 0.4110502004623413, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0438, + "step": 13780 + }, + { + "epoch": 0.3942816297355254, + "grad_norm": 0.7104408144950867, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.042, + "step": 13790 + }, + { + "epoch": 0.3945675482487491, + "grad_norm": 0.5490137338638306, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0477, + "step": 13800 + }, + { + "epoch": 0.39485346676197286, + "grad_norm": 0.4189203083515167, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0446, + "step": 13810 + }, + { + "epoch": 0.39513938527519654, + "grad_norm": 3.620929479598999, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0541, + "step": 13820 + }, + { + "epoch": 0.3954253037884203, + "grad_norm": 0.4670915901660919, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0391, + "step": 13830 + }, + { + "epoch": 0.395711222301644, + "grad_norm": 0.4475649297237396, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.04, + "step": 13840 + }, + { + "epoch": 0.39599714081486775, + "grad_norm": 0.4646693170070648, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0412, + "step": 13850 + }, + { + "epoch": 0.3962830593280915, + "grad_norm": 0.4141371250152588, + "learning_rate": 1.259152361972498e-05, + "loss": 0.039, + "step": 13860 + }, + { + "epoch": 0.39656897784131523, + "grad_norm": 0.7549411058425903, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0414, + "step": 13870 + }, + { + "epoch": 0.39685489635453897, + "grad_norm": 0.5687856078147888, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0441, + "step": 13880 + }, + { + "epoch": 0.3971408148677627, + "grad_norm": 0.582946240901947, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0451, + "step": 13890 + }, + { + "epoch": 0.39742673338098644, + "grad_norm": 0.6410595178604126, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0362, + "step": 13900 + }, + { + "epoch": 0.3977126518942101, + "grad_norm": 0.4375670850276947, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0552, + "step": 13910 + }, + { + "epoch": 0.39799857040743386, + "grad_norm": 0.5675646662712097, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0373, + "step": 13920 + }, + { + "epoch": 0.3982844889206576, + "grad_norm": 0.544170618057251, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0449, + "step": 13930 + }, + { + "epoch": 0.39857040743388134, + "grad_norm": 0.44928276538848877, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0461, + "step": 13940 + }, + { + "epoch": 0.3988563259471051, + "grad_norm": 0.511382520198822, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0413, + "step": 13950 + }, + { + "epoch": 0.3991422444603288, + "grad_norm": 0.38443753123283386, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0374, + "step": 13960 + }, + { + "epoch": 0.39942816297355255, + "grad_norm": 0.5726080536842346, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0553, + "step": 13970 + }, + { + "epoch": 0.3997140814867763, + "grad_norm": 0.554694414138794, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0404, + "step": 13980 + }, + { + "epoch": 0.4, + "grad_norm": 0.4891316592693329, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0418, + "step": 13990 + }, + { + "epoch": 0.4002859185132237, + "grad_norm": 0.5150312781333923, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0418, + "step": 14000 + }, + { + "epoch": 0.40057183702644744, + "grad_norm": 0.9077253937721252, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0415, + "step": 14010 + }, + { + "epoch": 0.4008577555396712, + "grad_norm": 0.9126781225204468, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.048, + "step": 14020 + }, + { + "epoch": 0.4011436740528949, + "grad_norm": 0.6264623999595642, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0411, + "step": 14030 + }, + { + "epoch": 0.40142959256611865, + "grad_norm": 0.523853600025177, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.051, + "step": 14040 + }, + { + "epoch": 0.4017155110793424, + "grad_norm": 0.6340035200119019, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0426, + "step": 14050 + }, + { + "epoch": 0.40200142959256613, + "grad_norm": 0.3594725430011749, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0397, + "step": 14060 + }, + { + "epoch": 0.40228734810578987, + "grad_norm": 0.941470742225647, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0402, + "step": 14070 + }, + { + "epoch": 0.4025732666190136, + "grad_norm": 0.840506911277771, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0473, + "step": 14080 + }, + { + "epoch": 0.4028591851322373, + "grad_norm": 0.3359200954437256, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0405, + "step": 14090 + }, + { + "epoch": 0.403145103645461, + "grad_norm": 0.49658629298210144, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0464, + "step": 14100 + }, + { + "epoch": 0.40343102215868476, + "grad_norm": 0.7940187454223633, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0417, + "step": 14110 + }, + { + "epoch": 0.4037169406719085, + "grad_norm": 0.30110660195350647, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0371, + "step": 14120 + }, + { + "epoch": 0.40400285918513223, + "grad_norm": 0.42845240235328674, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.053, + "step": 14130 + }, + { + "epoch": 0.40428877769835597, + "grad_norm": 0.997348427772522, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.041, + "step": 14140 + }, + { + "epoch": 0.4045746962115797, + "grad_norm": 0.4759966731071472, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0377, + "step": 14150 + }, + { + "epoch": 0.40486061472480345, + "grad_norm": 0.42045602202415466, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0397, + "step": 14160 + }, + { + "epoch": 0.4051465332380272, + "grad_norm": 0.6400002837181091, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0507, + "step": 14170 + }, + { + "epoch": 0.40543245175125087, + "grad_norm": 0.5473673939704895, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0359, + "step": 14180 + }, + { + "epoch": 0.4057183702644746, + "grad_norm": 0.7414730787277222, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0416, + "step": 14190 + }, + { + "epoch": 0.40600428877769834, + "grad_norm": 0.4691861867904663, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0363, + "step": 14200 + }, + { + "epoch": 0.4062902072909221, + "grad_norm": 0.9186112880706787, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0445, + "step": 14210 + }, + { + "epoch": 0.4065761258041458, + "grad_norm": 0.6782190203666687, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.40686204431736955, + "grad_norm": 0.6948013305664062, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.037, + "step": 14230 + }, + { + "epoch": 0.4071479628305933, + "grad_norm": 0.3034680485725403, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0371, + "step": 14240 + }, + { + "epoch": 0.40743388134381703, + "grad_norm": 0.4254174828529358, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0449, + "step": 14250 + }, + { + "epoch": 0.40771979985704077, + "grad_norm": 1.3622064590454102, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0428, + "step": 14260 + }, + { + "epoch": 0.40800571837026445, + "grad_norm": 0.5928359031677246, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0424, + "step": 14270 + }, + { + "epoch": 0.4082916368834882, + "grad_norm": 0.9103132486343384, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0414, + "step": 14280 + }, + { + "epoch": 0.4085775553967119, + "grad_norm": 0.6338028311729431, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0376, + "step": 14290 + }, + { + "epoch": 0.40886347390993566, + "grad_norm": 0.9920284748077393, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0393, + "step": 14300 + }, + { + "epoch": 0.4091493924231594, + "grad_norm": 0.411830335855484, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0336, + "step": 14310 + }, + { + "epoch": 0.40943531093638313, + "grad_norm": 0.6977682709693909, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0454, + "step": 14320 + }, + { + "epoch": 0.40972122944960687, + "grad_norm": 0.6303663849830627, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0453, + "step": 14330 + }, + { + "epoch": 0.4100071479628306, + "grad_norm": 0.3048207759857178, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0373, + "step": 14340 + }, + { + "epoch": 0.41029306647605435, + "grad_norm": 0.7683395743370056, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0438, + "step": 14350 + }, + { + "epoch": 0.41057898498927803, + "grad_norm": 0.5791511535644531, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0392, + "step": 14360 + }, + { + "epoch": 0.41086490350250177, + "grad_norm": 0.876626193523407, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0324, + "step": 14370 + }, + { + "epoch": 0.4111508220157255, + "grad_norm": 0.5971815586090088, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0368, + "step": 14380 + }, + { + "epoch": 0.41143674052894924, + "grad_norm": 0.6508862376213074, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0411, + "step": 14390 + }, + { + "epoch": 0.411722659042173, + "grad_norm": 0.4704359471797943, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0351, + "step": 14400 + }, + { + "epoch": 0.4120085775553967, + "grad_norm": 0.4266453683376312, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0367, + "step": 14410 + }, + { + "epoch": 0.41229449606862045, + "grad_norm": 0.5898434519767761, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0376, + "step": 14420 + }, + { + "epoch": 0.4125804145818442, + "grad_norm": 0.8741532564163208, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0419, + "step": 14430 + }, + { + "epoch": 0.41286633309506793, + "grad_norm": 0.24328190088272095, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0333, + "step": 14440 + }, + { + "epoch": 0.4131522516082916, + "grad_norm": 0.4263601303100586, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.039, + "step": 14450 + }, + { + "epoch": 0.41343817012151535, + "grad_norm": 0.6311615109443665, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0454, + "step": 14460 + }, + { + "epoch": 0.4137240886347391, + "grad_norm": 0.7424519658088684, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0392, + "step": 14470 + }, + { + "epoch": 0.4140100071479628, + "grad_norm": 0.48323145508766174, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0374, + "step": 14480 + }, + { + "epoch": 0.41429592566118656, + "grad_norm": 0.38597407937049866, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0393, + "step": 14490 + }, + { + "epoch": 0.4145818441744103, + "grad_norm": 0.7251518964767456, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0431, + "step": 14500 + }, + { + "epoch": 0.41486776268763403, + "grad_norm": 0.44361060857772827, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0426, + "step": 14510 + }, + { + "epoch": 0.41515368120085777, + "grad_norm": 0.5625014305114746, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0372, + "step": 14520 + }, + { + "epoch": 0.4154395997140815, + "grad_norm": 0.27855798602104187, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0356, + "step": 14530 + }, + { + "epoch": 0.4157255182273052, + "grad_norm": 0.5966296195983887, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0387, + "step": 14540 + }, + { + "epoch": 0.41601143674052893, + "grad_norm": 0.49445512890815735, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0355, + "step": 14550 + }, + { + "epoch": 0.41629735525375267, + "grad_norm": 0.3813278377056122, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0456, + "step": 14560 + }, + { + "epoch": 0.4165832737669764, + "grad_norm": 0.5962988138198853, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0401, + "step": 14570 + }, + { + "epoch": 0.41686919228020014, + "grad_norm": 0.4028547406196594, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0371, + "step": 14580 + }, + { + "epoch": 0.4171551107934239, + "grad_norm": 1.348706841468811, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0426, + "step": 14590 + }, + { + "epoch": 0.4174410293066476, + "grad_norm": 1.2782070636749268, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0393, + "step": 14600 + }, + { + "epoch": 0.41772694781987135, + "grad_norm": 1.0024999380111694, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0436, + "step": 14610 + }, + { + "epoch": 0.4180128663330951, + "grad_norm": 0.35450127720832825, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0411, + "step": 14620 + }, + { + "epoch": 0.41829878484631877, + "grad_norm": 0.5827250480651855, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0372, + "step": 14630 + }, + { + "epoch": 0.4185847033595425, + "grad_norm": 0.5905774235725403, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0394, + "step": 14640 + }, + { + "epoch": 0.41887062187276625, + "grad_norm": 0.652074933052063, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0405, + "step": 14650 + }, + { + "epoch": 0.41915654038599, + "grad_norm": 0.7245490550994873, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0473, + "step": 14660 + }, + { + "epoch": 0.4194424588992137, + "grad_norm": 0.5153012871742249, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.043, + "step": 14670 + }, + { + "epoch": 0.41972837741243746, + "grad_norm": 0.516107976436615, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0434, + "step": 14680 + }, + { + "epoch": 0.4200142959256612, + "grad_norm": 0.4743354618549347, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0429, + "step": 14690 + }, + { + "epoch": 0.42030021443888493, + "grad_norm": 0.547875165939331, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0395, + "step": 14700 + }, + { + "epoch": 0.42058613295210867, + "grad_norm": 0.6398400068283081, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0384, + "step": 14710 + }, + { + "epoch": 0.42087205146533235, + "grad_norm": 0.5891467332839966, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0399, + "step": 14720 + }, + { + "epoch": 0.4211579699785561, + "grad_norm": 0.3927595615386963, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0353, + "step": 14730 + }, + { + "epoch": 0.42144388849177983, + "grad_norm": 0.6477030515670776, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0492, + "step": 14740 + }, + { + "epoch": 0.42172980700500357, + "grad_norm": 0.7090615034103394, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.042, + "step": 14750 + }, + { + "epoch": 0.4220157255182273, + "grad_norm": 0.6572134494781494, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0406, + "step": 14760 + }, + { + "epoch": 0.42230164403145104, + "grad_norm": 0.787663996219635, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0424, + "step": 14770 + }, + { + "epoch": 0.4225875625446748, + "grad_norm": 0.8419309258460999, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0427, + "step": 14780 + }, + { + "epoch": 0.4228734810578985, + "grad_norm": 0.6204128861427307, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0364, + "step": 14790 + }, + { + "epoch": 0.42315939957112225, + "grad_norm": 0.7446070313453674, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0391, + "step": 14800 + }, + { + "epoch": 0.42344531808434593, + "grad_norm": 0.7446451783180237, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0384, + "step": 14810 + }, + { + "epoch": 0.42373123659756967, + "grad_norm": 0.6946475505828857, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0375, + "step": 14820 + }, + { + "epoch": 0.4240171551107934, + "grad_norm": 0.6997008323669434, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0393, + "step": 14830 + }, + { + "epoch": 0.42430307362401715, + "grad_norm": 0.4857316315174103, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0474, + "step": 14840 + }, + { + "epoch": 0.4245889921372409, + "grad_norm": 1.3516888618469238, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.047, + "step": 14850 + }, + { + "epoch": 0.4248749106504646, + "grad_norm": 0.40320220589637756, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0418, + "step": 14860 + }, + { + "epoch": 0.42516082916368836, + "grad_norm": 0.9002796411514282, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0434, + "step": 14870 + }, + { + "epoch": 0.4254467476769121, + "grad_norm": 0.3810071349143982, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0338, + "step": 14880 + }, + { + "epoch": 0.42573266619013583, + "grad_norm": 0.5786157250404358, + "learning_rate": 1.159527607963768e-05, + "loss": 0.037, + "step": 14890 + }, + { + "epoch": 0.4260185847033595, + "grad_norm": 0.6316869258880615, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0388, + "step": 14900 + }, + { + "epoch": 0.42630450321658325, + "grad_norm": 0.608745276927948, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0426, + "step": 14910 + }, + { + "epoch": 0.426590421729807, + "grad_norm": 0.6655036807060242, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0433, + "step": 14920 + }, + { + "epoch": 0.4268763402430307, + "grad_norm": 0.29059523344039917, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0507, + "step": 14930 + }, + { + "epoch": 0.42716225875625446, + "grad_norm": 0.9066076278686523, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0447, + "step": 14940 + }, + { + "epoch": 0.4274481772694782, + "grad_norm": 1.0660220384597778, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0512, + "step": 14950 + }, + { + "epoch": 0.42773409578270194, + "grad_norm": 0.6081144213676453, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0426, + "step": 14960 + }, + { + "epoch": 0.4280200142959257, + "grad_norm": 0.46524369716644287, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0435, + "step": 14970 + }, + { + "epoch": 0.4283059328091494, + "grad_norm": 0.3497388958930969, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0492, + "step": 14980 + }, + { + "epoch": 0.4285918513223731, + "grad_norm": 0.41300803422927856, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.034, + "step": 14990 + }, + { + "epoch": 0.42887776983559683, + "grad_norm": 0.4363289177417755, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0358, + "step": 15000 + }, + { + "epoch": 0.42916368834882057, + "grad_norm": 1.314915418624878, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.047, + "step": 15010 + }, + { + "epoch": 0.4294496068620443, + "grad_norm": 0.558199942111969, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0313, + "step": 15020 + }, + { + "epoch": 0.42973552537526805, + "grad_norm": 0.3857463598251343, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0416, + "step": 15030 + }, + { + "epoch": 0.4300214438884918, + "grad_norm": 0.4701749384403229, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0425, + "step": 15040 + }, + { + "epoch": 0.4303073624017155, + "grad_norm": 0.4611213803291321, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0457, + "step": 15050 + }, + { + "epoch": 0.43059328091493926, + "grad_norm": 0.5338016152381897, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.038, + "step": 15060 + }, + { + "epoch": 0.430879199428163, + "grad_norm": 0.9078943133354187, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0395, + "step": 15070 + }, + { + "epoch": 0.4311651179413867, + "grad_norm": 0.5354048013687134, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0403, + "step": 15080 + }, + { + "epoch": 0.4314510364546104, + "grad_norm": 0.35511279106140137, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0377, + "step": 15090 + }, + { + "epoch": 0.43173695496783415, + "grad_norm": 0.37104350328445435, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0426, + "step": 15100 + }, + { + "epoch": 0.4320228734810579, + "grad_norm": 0.8916210532188416, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0387, + "step": 15110 + }, + { + "epoch": 0.4323087919942816, + "grad_norm": 0.514994740486145, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0384, + "step": 15120 + }, + { + "epoch": 0.43259471050750536, + "grad_norm": 0.8440690040588379, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0437, + "step": 15130 + }, + { + "epoch": 0.4328806290207291, + "grad_norm": 0.6815949082374573, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0453, + "step": 15140 + }, + { + "epoch": 0.43316654753395284, + "grad_norm": 0.33178189396858215, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0351, + "step": 15150 + }, + { + "epoch": 0.4334524660471766, + "grad_norm": 0.5686727166175842, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0368, + "step": 15160 + }, + { + "epoch": 0.43373838456040026, + "grad_norm": 0.44143930077552795, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0443, + "step": 15170 + }, + { + "epoch": 0.434024303073624, + "grad_norm": 0.3238232135772705, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0348, + "step": 15180 + }, + { + "epoch": 0.43431022158684773, + "grad_norm": 0.5038242340087891, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0343, + "step": 15190 + }, + { + "epoch": 0.43459614010007147, + "grad_norm": 0.4904351234436035, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0397, + "step": 15200 + }, + { + "epoch": 0.4348820586132952, + "grad_norm": 0.5325750708580017, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0499, + "step": 15210 + }, + { + "epoch": 0.43516797712651895, + "grad_norm": 0.39443954825401306, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.044, + "step": 15220 + }, + { + "epoch": 0.4354538956397427, + "grad_norm": 0.6782003045082092, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0358, + "step": 15230 + }, + { + "epoch": 0.4357398141529664, + "grad_norm": 0.47862571477890015, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0418, + "step": 15240 + }, + { + "epoch": 0.43602573266619016, + "grad_norm": 1.6515535116195679, + "learning_rate": 1.124468908014616e-05, + "loss": 0.043, + "step": 15250 + }, + { + "epoch": 0.43631165117941384, + "grad_norm": 0.4902660846710205, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0371, + "step": 15260 + }, + { + "epoch": 0.4365975696926376, + "grad_norm": 0.5742762088775635, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0369, + "step": 15270 + }, + { + "epoch": 0.4368834882058613, + "grad_norm": 0.42058590054512024, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0378, + "step": 15280 + }, + { + "epoch": 0.43716940671908505, + "grad_norm": 0.43729284405708313, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0352, + "step": 15290 + }, + { + "epoch": 0.4374553252323088, + "grad_norm": 0.4689466953277588, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0433, + "step": 15300 + }, + { + "epoch": 0.4377412437455325, + "grad_norm": 0.6272432208061218, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0548, + "step": 15310 + }, + { + "epoch": 0.43802716225875626, + "grad_norm": 1.1129611730575562, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0437, + "step": 15320 + }, + { + "epoch": 0.43831308077198, + "grad_norm": 0.9332655072212219, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0503, + "step": 15330 + }, + { + "epoch": 0.43859899928520374, + "grad_norm": 0.35150477290153503, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0351, + "step": 15340 + }, + { + "epoch": 0.4388849177984274, + "grad_norm": 0.3826565444469452, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0361, + "step": 15350 + }, + { + "epoch": 0.43917083631165116, + "grad_norm": 0.817319393157959, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0352, + "step": 15360 + }, + { + "epoch": 0.4394567548248749, + "grad_norm": 0.4379598796367645, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0469, + "step": 15370 + }, + { + "epoch": 0.43974267333809863, + "grad_norm": 0.6475314497947693, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0456, + "step": 15380 + }, + { + "epoch": 0.44002859185132237, + "grad_norm": 0.529088020324707, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0453, + "step": 15390 + }, + { + "epoch": 0.4403145103645461, + "grad_norm": 0.4915194809436798, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0369, + "step": 15400 + }, + { + "epoch": 0.44060042887776985, + "grad_norm": 0.4766380786895752, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0391, + "step": 15410 + }, + { + "epoch": 0.4408863473909936, + "grad_norm": 0.34667786955833435, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0327, + "step": 15420 + }, + { + "epoch": 0.4411722659042173, + "grad_norm": 0.504242479801178, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0413, + "step": 15430 + }, + { + "epoch": 0.441458184417441, + "grad_norm": 0.49786439538002014, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0361, + "step": 15440 + }, + { + "epoch": 0.44174410293066474, + "grad_norm": 0.4997329115867615, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0368, + "step": 15450 + }, + { + "epoch": 0.4420300214438885, + "grad_norm": 0.2992185056209564, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0359, + "step": 15460 + }, + { + "epoch": 0.4423159399571122, + "grad_norm": 0.6645393371582031, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0401, + "step": 15470 + }, + { + "epoch": 0.44260185847033595, + "grad_norm": 0.6327983140945435, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0386, + "step": 15480 + }, + { + "epoch": 0.4428877769835597, + "grad_norm": 0.45607903599739075, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0386, + "step": 15490 + }, + { + "epoch": 0.4431736954967834, + "grad_norm": 0.4401610493659973, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0417, + "step": 15500 + }, + { + "epoch": 0.44345961401000716, + "grad_norm": 0.5778466463088989, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0417, + "step": 15510 + }, + { + "epoch": 0.4437455325232309, + "grad_norm": 0.2164914309978485, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0355, + "step": 15520 + }, + { + "epoch": 0.4440314510364546, + "grad_norm": 0.3869318664073944, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0361, + "step": 15530 + }, + { + "epoch": 0.4443173695496783, + "grad_norm": 0.3843154311180115, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0459, + "step": 15540 + }, + { + "epoch": 0.44460328806290206, + "grad_norm": 0.8488825559616089, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0406, + "step": 15550 + }, + { + "epoch": 0.4448892065761258, + "grad_norm": 0.5055183172225952, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0359, + "step": 15560 + }, + { + "epoch": 0.44517512508934953, + "grad_norm": 0.40923011302948, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0435, + "step": 15570 + }, + { + "epoch": 0.44546104360257327, + "grad_norm": 0.48997730016708374, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0395, + "step": 15580 + }, + { + "epoch": 0.445746962115797, + "grad_norm": 0.5149131417274475, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.041, + "step": 15590 + }, + { + "epoch": 0.44603288062902074, + "grad_norm": 0.7277303338050842, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0452, + "step": 15600 + }, + { + "epoch": 0.4463187991422445, + "grad_norm": 0.48676377534866333, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0363, + "step": 15610 + }, + { + "epoch": 0.44660471765546816, + "grad_norm": 0.49031221866607666, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0356, + "step": 15620 + }, + { + "epoch": 0.4468906361686919, + "grad_norm": 0.38877514004707336, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.036, + "step": 15630 + }, + { + "epoch": 0.44717655468191564, + "grad_norm": 0.570068895816803, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0403, + "step": 15640 + }, + { + "epoch": 0.4474624731951394, + "grad_norm": 0.48499882221221924, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0395, + "step": 15650 + }, + { + "epoch": 0.4477483917083631, + "grad_norm": 0.7251732349395752, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0399, + "step": 15660 + }, + { + "epoch": 0.44803431022158685, + "grad_norm": 0.3927334249019623, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0359, + "step": 15670 + }, + { + "epoch": 0.4483202287348106, + "grad_norm": 0.5614549517631531, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.035, + "step": 15680 + }, + { + "epoch": 0.4486061472480343, + "grad_norm": 0.383831262588501, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0416, + "step": 15690 + }, + { + "epoch": 0.44889206576125806, + "grad_norm": 1.9365276098251343, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0498, + "step": 15700 + }, + { + "epoch": 0.44917798427448175, + "grad_norm": 0.6964924931526184, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.034, + "step": 15710 + }, + { + "epoch": 0.4494639027877055, + "grad_norm": 0.5148108601570129, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0401, + "step": 15720 + }, + { + "epoch": 0.4497498213009292, + "grad_norm": 0.4529317617416382, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0361, + "step": 15730 + }, + { + "epoch": 0.45003573981415296, + "grad_norm": 0.6648512482643127, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0365, + "step": 15740 + }, + { + "epoch": 0.4503216583273767, + "grad_norm": 0.8183113932609558, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0416, + "step": 15750 + }, + { + "epoch": 0.45060757684060043, + "grad_norm": 0.8802638649940491, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0406, + "step": 15760 + }, + { + "epoch": 0.45089349535382417, + "grad_norm": 0.6329004764556885, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0395, + "step": 15770 + }, + { + "epoch": 0.4511794138670479, + "grad_norm": 0.35283520817756653, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0364, + "step": 15780 + }, + { + "epoch": 0.45146533238027164, + "grad_norm": 0.5156061053276062, + "learning_rate": 1.071827766589186e-05, + "loss": 0.031, + "step": 15790 + }, + { + "epoch": 0.4517512508934953, + "grad_norm": 0.37875205278396606, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0375, + "step": 15800 + }, + { + "epoch": 0.45203716940671906, + "grad_norm": 0.5543273687362671, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0421, + "step": 15810 + }, + { + "epoch": 0.4523230879199428, + "grad_norm": 0.3808431923389435, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0323, + "step": 15820 + }, + { + "epoch": 0.45260900643316654, + "grad_norm": 0.8648643493652344, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0396, + "step": 15830 + }, + { + "epoch": 0.4528949249463903, + "grad_norm": 0.7893536686897278, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0417, + "step": 15840 + }, + { + "epoch": 0.453180843459614, + "grad_norm": 0.904137134552002, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0384, + "step": 15850 + }, + { + "epoch": 0.45346676197283775, + "grad_norm": 0.6095889806747437, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0457, + "step": 15860 + }, + { + "epoch": 0.4537526804860615, + "grad_norm": 0.5691415667533875, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0438, + "step": 15870 + }, + { + "epoch": 0.4540385989992852, + "grad_norm": 0.37868618965148926, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0414, + "step": 15880 + }, + { + "epoch": 0.4543245175125089, + "grad_norm": 0.7962950468063354, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0405, + "step": 15890 + }, + { + "epoch": 0.45461043602573264, + "grad_norm": 0.8862378597259521, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0475, + "step": 15900 + }, + { + "epoch": 0.4548963545389564, + "grad_norm": 0.8762509822845459, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0472, + "step": 15910 + }, + { + "epoch": 0.4551822730521801, + "grad_norm": 0.6006313562393188, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0417, + "step": 15920 + }, + { + "epoch": 0.45546819156540386, + "grad_norm": 0.3340131938457489, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0374, + "step": 15930 + }, + { + "epoch": 0.4557541100786276, + "grad_norm": 0.2639552056789398, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0387, + "step": 15940 + }, + { + "epoch": 0.45604002859185133, + "grad_norm": 0.42564907670021057, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0376, + "step": 15950 + }, + { + "epoch": 0.45632594710507507, + "grad_norm": 0.503834068775177, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0344, + "step": 15960 + }, + { + "epoch": 0.4566118656182988, + "grad_norm": 0.5962334871292114, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0379, + "step": 15970 + }, + { + "epoch": 0.4568977841315225, + "grad_norm": 0.3271556794643402, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0361, + "step": 15980 + }, + { + "epoch": 0.4571837026447462, + "grad_norm": 0.5501612424850464, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0356, + "step": 15990 + }, + { + "epoch": 0.45746962115796996, + "grad_norm": 1.0399914979934692, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.039, + "step": 16000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.99422246780928e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/training_args.bin b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a8e9db2fc8c02e02c3d9dc8ab6720ad303a5b3a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612ba70c7690571cb25b3741b149289d0da6675f330268700d4dd75e92ecc19a +size 6097 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/added_tokens.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/generation_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00001-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d1eef59f2babbacd99d7bfa2c8d1499b24ebc8ee --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9636d34e3f65e9ea6c5930ff4f4a14c1bc9be898c76f682ae441e98113508ac1 +size 4921072616 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00002-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48e69ede054488b3fb9d8a1d63cfa93907639b6b --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eecf415f38bbd674619b4a8e24a560e9672eefb9bf9946ae4123fbed2f6bbb98 +size 4978830984 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00003-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ea1b8abdb07d6ec912e24b692528875c31b1d5ae --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:465fcaf22dc6a96994cd2858925ce1aa7173d06e62814670f94b4fef0c31c66a +size 4100977896 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model.safetensors.index.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/norm_stats.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..7a37358d95e92a337ffbc69008e6d3a514583ff2 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -15.553912042236327, + -29.199742523193358, + -19.58108451538086, + -2.290254103851318, + -3.98537020587921, + -3.326780859374999, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 20.256868560791013, + 29.94644501495361, + 21.81786548461914, + 2.931905368041992, + 5.064435471534729, + 3.8213318216323877, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.8829866647720337, + 2.0021812915802, + 0.2094610631465912, + 0.0940750315785408, + 0.0910087525844574, + 0.012966467998921871, + -0.09716881066560745, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.976093769073486, + 10.930583953857422, + 8.330232620239258, + 0.8605863451957703, + 1.5304595232009888, + 1.1747541427612305, + 0.995267927646637, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -14.624815139007566, + -31.510755078125, + -35.281760287475585, + -4.413841687011719, + -8.509904860687255, + -6.548201916885375, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 40.4127169593811, + 31.91034956970215, + 26.84413584289551, + 7.540738459014893, + 10.178268561553956, + 9.913993389892582, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 10.31286334991455, + 3.0421667098999023, + -4.947638511657715, + 0.41632387042045593, + -0.9987452030181885, + -0.18793217837810516, + -0.08814626932144165, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 10.463665962219238, + 14.231209754943848, + 11.03242301940918, + 2.1795010566711426, + 3.3540749549865723, + 2.708117961883545, + 0.9961075186729431, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/pi0.yaml b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/special_tokens_map.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer.model b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/trainer_state.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..74f10a9b762650dab7f788faf2a4e7f69417a8e6 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/trainer_state.json @@ -0,0 +1,12634 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5146533238027162, + "eval_steps": 500, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002859185132237312, + "grad_norm": 4.32843542098999, + "learning_rate": 1.8e-07, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0005718370264474624, + "grad_norm": 5.184113502502441, + "learning_rate": 3.8e-07, + "loss": 0.6206, + "step": 20 + }, + { + "epoch": 0.0008577555396711937, + "grad_norm": 4.515527248382568, + "learning_rate": 5.800000000000001e-07, + "loss": 0.582, + "step": 30 + }, + { + "epoch": 0.0011436740528949249, + "grad_norm": 2.8382818698883057, + "learning_rate": 7.8e-07, + "loss": 0.544, + "step": 40 + }, + { + "epoch": 0.0014295925661186562, + "grad_norm": 4.019079208374023, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6381, + "step": 50 + }, + { + "epoch": 0.0017155110793423873, + "grad_norm": 2.9916157722473145, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5463, + "step": 60 + }, + { + "epoch": 0.0020014295925661185, + "grad_norm": 3.3288328647613525, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.446, + "step": 70 + }, + { + "epoch": 0.0022873481057898498, + "grad_norm": 3.181410312652588, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4497, + "step": 80 + }, + { + "epoch": 0.002573266619013581, + "grad_norm": 1.421942949295044, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.349, + "step": 90 + }, + { + "epoch": 0.0028591851322373124, + "grad_norm": 1.908596396446228, + "learning_rate": 1.98e-06, + "loss": 0.3338, + "step": 100 + }, + { + "epoch": 0.0031451036454610438, + "grad_norm": 1.8309729099273682, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.0034310221586847747, + "grad_norm": 3.051408290863037, + "learning_rate": 2.38e-06, + "loss": 0.2418, + "step": 120 + }, + { + "epoch": 0.003716940671908506, + "grad_norm": 2.4083356857299805, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1726, + "step": 130 + }, + { + "epoch": 0.004002859185132237, + "grad_norm": 1.111687421798706, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.2164, + "step": 140 + }, + { + "epoch": 0.004288777698355968, + "grad_norm": 1.3874679803848267, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1312, + "step": 150 + }, + { + "epoch": 0.0045746962115796996, + "grad_norm": 1.2791540622711182, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1198, + "step": 160 + }, + { + "epoch": 0.004860614724803431, + "grad_norm": 1.6237181425094604, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1027, + "step": 170 + }, + { + "epoch": 0.005146533238027162, + "grad_norm": 0.9669432640075684, + "learning_rate": 3.58e-06, + "loss": 0.0968, + "step": 180 + }, + { + "epoch": 0.0054324517512508936, + "grad_norm": 1.4933182001113892, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.1012, + "step": 190 + }, + { + "epoch": 0.005718370264474625, + "grad_norm": 1.8615745306015015, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0901, + "step": 200 + }, + { + "epoch": 0.006004288777698356, + "grad_norm": 1.867163062095642, + "learning_rate": 4.18e-06, + "loss": 0.1067, + "step": 210 + }, + { + "epoch": 0.0062902072909220876, + "grad_norm": 1.199497103691101, + "learning_rate": 4.38e-06, + "loss": 0.0841, + "step": 220 + }, + { + "epoch": 0.006576125804145818, + "grad_norm": 1.1568272113800049, + "learning_rate": 4.58e-06, + "loss": 0.0951, + "step": 230 + }, + { + "epoch": 0.006862044317369549, + "grad_norm": 2.139226198196411, + "learning_rate": 4.78e-06, + "loss": 0.0845, + "step": 240 + }, + { + "epoch": 0.007147962830593281, + "grad_norm": 1.0357667207717896, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0828, + "step": 250 + }, + { + "epoch": 0.007433881343817012, + "grad_norm": 1.0145683288574219, + "learning_rate": 5.18e-06, + "loss": 0.0925, + "step": 260 + }, + { + "epoch": 0.007719799857040743, + "grad_norm": 1.308053731918335, + "learning_rate": 5.380000000000001e-06, + "loss": 0.082, + "step": 270 + }, + { + "epoch": 0.008005718370264474, + "grad_norm": 1.1561739444732666, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0888, + "step": 280 + }, + { + "epoch": 0.008291636883488206, + "grad_norm": 0.8777005672454834, + "learning_rate": 5.78e-06, + "loss": 0.0693, + "step": 290 + }, + { + "epoch": 0.008577555396711936, + "grad_norm": 0.9127368330955505, + "learning_rate": 5.98e-06, + "loss": 0.0823, + "step": 300 + }, + { + "epoch": 0.008863473909935669, + "grad_norm": 0.5608117580413818, + "learning_rate": 6.18e-06, + "loss": 0.0733, + "step": 310 + }, + { + "epoch": 0.009149392423159399, + "grad_norm": 1.9068444967269897, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0772, + "step": 320 + }, + { + "epoch": 0.009435310936383131, + "grad_norm": 0.9090886116027832, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.062, + "step": 330 + }, + { + "epoch": 0.009721229449606862, + "grad_norm": 1.191778540611267, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0718, + "step": 340 + }, + { + "epoch": 0.010007147962830594, + "grad_norm": 1.3743036985397339, + "learning_rate": 6.98e-06, + "loss": 0.0822, + "step": 350 + }, + { + "epoch": 0.010293066476054324, + "grad_norm": 1.4244364500045776, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0793, + "step": 360 + }, + { + "epoch": 0.010578984989278055, + "grad_norm": 1.1766910552978516, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0637, + "step": 370 + }, + { + "epoch": 0.010864903502501787, + "grad_norm": 1.1331329345703125, + "learning_rate": 7.58e-06, + "loss": 0.0705, + "step": 380 + }, + { + "epoch": 0.011150822015725518, + "grad_norm": 0.4898548424243927, + "learning_rate": 7.78e-06, + "loss": 0.0686, + "step": 390 + }, + { + "epoch": 0.01143674052894925, + "grad_norm": 0.7398406267166138, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0719, + "step": 400 + }, + { + "epoch": 0.01172265904217298, + "grad_norm": 1.1516162157058716, + "learning_rate": 8.18e-06, + "loss": 0.0696, + "step": 410 + }, + { + "epoch": 0.012008577555396712, + "grad_norm": 1.6034163236618042, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0698, + "step": 420 + }, + { + "epoch": 0.012294496068620443, + "grad_norm": 1.2195311784744263, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.012580414581844175, + "grad_norm": 1.1106441020965576, + "learning_rate": 8.78e-06, + "loss": 0.0749, + "step": 440 + }, + { + "epoch": 0.012866333095067906, + "grad_norm": 1.1787506341934204, + "learning_rate": 8.98e-06, + "loss": 0.0718, + "step": 450 + }, + { + "epoch": 0.013152251608291636, + "grad_norm": 0.4380492568016052, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0692, + "step": 460 + }, + { + "epoch": 0.013438170121515368, + "grad_norm": 1.0138392448425293, + "learning_rate": 9.38e-06, + "loss": 0.0718, + "step": 470 + }, + { + "epoch": 0.013724088634739099, + "grad_norm": 0.50003582239151, + "learning_rate": 9.58e-06, + "loss": 0.078, + "step": 480 + }, + { + "epoch": 0.014010007147962831, + "grad_norm": 0.6253323554992676, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0762, + "step": 490 + }, + { + "epoch": 0.014295925661186561, + "grad_norm": 0.6725791096687317, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0615, + "step": 500 + }, + { + "epoch": 0.014581844174410294, + "grad_norm": 0.6100206971168518, + "learning_rate": 1.018e-05, + "loss": 0.0576, + "step": 510 + }, + { + "epoch": 0.014867762687634024, + "grad_norm": 1.9225071668624878, + "learning_rate": 1.038e-05, + "loss": 0.0957, + "step": 520 + }, + { + "epoch": 0.015153681200857756, + "grad_norm": 1.304625391960144, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.015439599714081487, + "grad_norm": 0.7657200694084167, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0612, + "step": 540 + }, + { + "epoch": 0.015725518227305217, + "grad_norm": 0.7371220588684082, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0719, + "step": 550 + }, + { + "epoch": 0.016011436740528948, + "grad_norm": 0.7274985313415527, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0594, + "step": 560 + }, + { + "epoch": 0.01629735525375268, + "grad_norm": 1.3222947120666504, + "learning_rate": 1.138e-05, + "loss": 0.0655, + "step": 570 + }, + { + "epoch": 0.016583273766976412, + "grad_norm": 0.965411901473999, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0638, + "step": 580 + }, + { + "epoch": 0.016869192280200142, + "grad_norm": 0.8161532878875732, + "learning_rate": 1.178e-05, + "loss": 0.0532, + "step": 590 + }, + { + "epoch": 0.017155110793423873, + "grad_norm": 0.8228808045387268, + "learning_rate": 1.198e-05, + "loss": 0.051, + "step": 600 + }, + { + "epoch": 0.017441029306647607, + "grad_norm": 0.6932743191719055, + "learning_rate": 1.218e-05, + "loss": 0.0595, + "step": 610 + }, + { + "epoch": 0.017726947819871337, + "grad_norm": 0.6848511099815369, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0589, + "step": 620 + }, + { + "epoch": 0.018012866333095068, + "grad_norm": 1.137454867362976, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0694, + "step": 630 + }, + { + "epoch": 0.018298784846318798, + "grad_norm": 0.8087878227233887, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0738, + "step": 640 + }, + { + "epoch": 0.01858470335954253, + "grad_norm": 0.8093737363815308, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.058, + "step": 650 + }, + { + "epoch": 0.018870621872766263, + "grad_norm": 0.8387401700019836, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0686, + "step": 660 + }, + { + "epoch": 0.019156540385989993, + "grad_norm": 1.1544110774993896, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0592, + "step": 670 + }, + { + "epoch": 0.019442458899213724, + "grad_norm": 0.8208314180374146, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.019728377412437454, + "grad_norm": 0.97088623046875, + "learning_rate": 1.378e-05, + "loss": 0.0675, + "step": 690 + }, + { + "epoch": 0.020014295925661188, + "grad_norm": 1.0991814136505127, + "learning_rate": 1.398e-05, + "loss": 0.0745, + "step": 700 + }, + { + "epoch": 0.02030021443888492, + "grad_norm": 0.9467299580574036, + "learning_rate": 1.418e-05, + "loss": 0.0645, + "step": 710 + }, + { + "epoch": 0.02058613295210865, + "grad_norm": 0.4910801351070404, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0466, + "step": 720 + }, + { + "epoch": 0.02087205146533238, + "grad_norm": 1.0102845430374146, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0735, + "step": 730 + }, + { + "epoch": 0.02115796997855611, + "grad_norm": 0.9033467769622803, + "learning_rate": 1.478e-05, + "loss": 0.0741, + "step": 740 + }, + { + "epoch": 0.021443888491779844, + "grad_norm": 1.6092171669006348, + "learning_rate": 1.498e-05, + "loss": 0.0737, + "step": 750 + }, + { + "epoch": 0.021729807005003574, + "grad_norm": 0.7047333717346191, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0604, + "step": 760 + }, + { + "epoch": 0.022015725518227305, + "grad_norm": 1.2015491724014282, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0646, + "step": 770 + }, + { + "epoch": 0.022301644031451035, + "grad_norm": 1.1669623851776123, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0587, + "step": 780 + }, + { + "epoch": 0.02258756254467477, + "grad_norm": 1.137113094329834, + "learning_rate": 1.578e-05, + "loss": 0.0692, + "step": 790 + }, + { + "epoch": 0.0228734810578985, + "grad_norm": 1.269505262374878, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0711, + "step": 800 + }, + { + "epoch": 0.02315939957112223, + "grad_norm": 0.942534863948822, + "learning_rate": 1.618e-05, + "loss": 0.0782, + "step": 810 + }, + { + "epoch": 0.02344531808434596, + "grad_norm": 0.9548556208610535, + "learning_rate": 1.638e-05, + "loss": 0.0814, + "step": 820 + }, + { + "epoch": 0.02373123659756969, + "grad_norm": 1.0210421085357666, + "learning_rate": 1.658e-05, + "loss": 0.0774, + "step": 830 + }, + { + "epoch": 0.024017155110793425, + "grad_norm": 1.0955135822296143, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0693, + "step": 840 + }, + { + "epoch": 0.024303073624017155, + "grad_norm": 1.2081682682037354, + "learning_rate": 1.698e-05, + "loss": 0.0589, + "step": 850 + }, + { + "epoch": 0.024588992137240886, + "grad_norm": 0.9728164076805115, + "learning_rate": 1.718e-05, + "loss": 0.0585, + "step": 860 + }, + { + "epoch": 0.024874910650464616, + "grad_norm": 1.310244083404541, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.066, + "step": 870 + }, + { + "epoch": 0.02516082916368835, + "grad_norm": 0.8860681653022766, + "learning_rate": 1.758e-05, + "loss": 0.0703, + "step": 880 + }, + { + "epoch": 0.02544674767691208, + "grad_norm": 2.1878466606140137, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0913, + "step": 890 + }, + { + "epoch": 0.02573266619013581, + "grad_norm": 0.6659205555915833, + "learning_rate": 1.798e-05, + "loss": 0.0603, + "step": 900 + }, + { + "epoch": 0.02601858470335954, + "grad_norm": 0.6700656414031982, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.074, + "step": 910 + }, + { + "epoch": 0.026304503216583272, + "grad_norm": 0.8292778134346008, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0529, + "step": 920 + }, + { + "epoch": 0.026590421729807006, + "grad_norm": 0.9637550115585327, + "learning_rate": 1.858e-05, + "loss": 0.0604, + "step": 930 + }, + { + "epoch": 0.026876340243030736, + "grad_norm": 0.4605652689933777, + "learning_rate": 1.878e-05, + "loss": 0.0657, + "step": 940 + }, + { + "epoch": 0.027162258756254467, + "grad_norm": 1.3346972465515137, + "learning_rate": 1.898e-05, + "loss": 0.0576, + "step": 950 + }, + { + "epoch": 0.027448177269478197, + "grad_norm": 0.8369432091712952, + "learning_rate": 1.918e-05, + "loss": 0.0567, + "step": 960 + }, + { + "epoch": 0.02773409578270193, + "grad_norm": 0.613459050655365, + "learning_rate": 1.938e-05, + "loss": 0.0523, + "step": 970 + }, + { + "epoch": 0.028020014295925662, + "grad_norm": 1.402799367904663, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0794, + "step": 980 + }, + { + "epoch": 0.028305932809149392, + "grad_norm": 1.1603201627731323, + "learning_rate": 1.978e-05, + "loss": 0.0583, + "step": 990 + }, + { + "epoch": 0.028591851322373123, + "grad_norm": 0.8101517558097839, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0584, + "step": 1000 + }, + { + "epoch": 0.028877769835596853, + "grad_norm": 1.060592770576477, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.077, + "step": 1010 + }, + { + "epoch": 0.029163688348820587, + "grad_norm": 1.2096195220947266, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.066, + "step": 1020 + }, + { + "epoch": 0.029449606862044318, + "grad_norm": 1.0035862922668457, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0625, + "step": 1030 + }, + { + "epoch": 0.029735525375268048, + "grad_norm": 0.44185084104537964, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.03002144388849178, + "grad_norm": 1.209908127784729, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0693, + "step": 1050 + }, + { + "epoch": 0.030307362401715512, + "grad_norm": 0.9716938138008118, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0706, + "step": 1060 + }, + { + "epoch": 0.030593280914939243, + "grad_norm": 0.8310994505882263, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0622, + "step": 1070 + }, + { + "epoch": 0.030879199428162973, + "grad_norm": 0.8737888932228088, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0564, + "step": 1080 + }, + { + "epoch": 0.031165117941386704, + "grad_norm": 0.7609763145446777, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0552, + "step": 1090 + }, + { + "epoch": 0.031451036454610434, + "grad_norm": 0.6319764256477356, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0593, + "step": 1100 + }, + { + "epoch": 0.031736954967834165, + "grad_norm": 0.5562251806259155, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0553, + "step": 1110 + }, + { + "epoch": 0.032022873481057895, + "grad_norm": 1.3476046323776245, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0805, + "step": 1120 + }, + { + "epoch": 0.03230879199428163, + "grad_norm": 0.5449394583702087, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0666, + "step": 1130 + }, + { + "epoch": 0.03259471050750536, + "grad_norm": 0.8675817251205444, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0703, + "step": 1140 + }, + { + "epoch": 0.032880629020729094, + "grad_norm": 0.8713150024414062, + "learning_rate": 1.999882759038658e-05, + "loss": 0.063, + "step": 1150 + }, + { + "epoch": 0.033166547533952824, + "grad_norm": 0.7205761075019836, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0597, + "step": 1160 + }, + { + "epoch": 0.033452466047176554, + "grad_norm": 0.482741117477417, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0558, + "step": 1170 + }, + { + "epoch": 0.033738384560400285, + "grad_norm": 0.8652167320251465, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0559, + "step": 1180 + }, + { + "epoch": 0.034024303073624015, + "grad_norm": 0.5286755561828613, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0673, + "step": 1190 + }, + { + "epoch": 0.034310221586847746, + "grad_norm": 0.9883217215538025, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0609, + "step": 1200 + }, + { + "epoch": 0.034596140100071476, + "grad_norm": 0.7700253129005432, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0703, + "step": 1210 + }, + { + "epoch": 0.034882058613295214, + "grad_norm": 0.8669867515563965, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0577, + "step": 1220 + }, + { + "epoch": 0.035167977126518944, + "grad_norm": 0.8856104016304016, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0599, + "step": 1230 + }, + { + "epoch": 0.035453895639742675, + "grad_norm": 0.5517004728317261, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0791, + "step": 1240 + }, + { + "epoch": 0.035739814152966405, + "grad_norm": 0.7505853176116943, + "learning_rate": 1.999672592499692e-05, + "loss": 0.086, + "step": 1250 + }, + { + "epoch": 0.036025732666190136, + "grad_norm": 0.7412230968475342, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0534, + "step": 1260 + }, + { + "epoch": 0.036311651179413866, + "grad_norm": 0.6629419922828674, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0607, + "step": 1270 + }, + { + "epoch": 0.036597569692637597, + "grad_norm": 0.7081887125968933, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0513, + "step": 1280 + }, + { + "epoch": 0.03688348820586133, + "grad_norm": 0.8555129766464233, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0574, + "step": 1290 + }, + { + "epoch": 0.03716940671908506, + "grad_norm": 0.5992563366889954, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0529, + "step": 1300 + }, + { + "epoch": 0.037455325232308795, + "grad_norm": 0.8527185320854187, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0588, + "step": 1310 + }, + { + "epoch": 0.037741243745532525, + "grad_norm": 1.078600525856018, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0644, + "step": 1320 + }, + { + "epoch": 0.038027162258756256, + "grad_norm": 0.8158502578735352, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0587, + "step": 1330 + }, + { + "epoch": 0.038313080771979986, + "grad_norm": 1.011278748512268, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0696, + "step": 1340 + }, + { + "epoch": 0.03859899928520372, + "grad_norm": 0.806888222694397, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0588, + "step": 1350 + }, + { + "epoch": 0.03888491779842745, + "grad_norm": 0.7776031494140625, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0461, + "step": 1360 + }, + { + "epoch": 0.03917083631165118, + "grad_norm": 0.6119349598884583, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0566, + "step": 1370 + }, + { + "epoch": 0.03945675482487491, + "grad_norm": 0.6168059706687927, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0636, + "step": 1380 + }, + { + "epoch": 0.03974267333809864, + "grad_norm": 0.8180692195892334, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0596, + "step": 1390 + }, + { + "epoch": 0.040028591851322376, + "grad_norm": 0.6775726079940796, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0586, + "step": 1400 + }, + { + "epoch": 0.040314510364546106, + "grad_norm": 0.7446377873420715, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.057, + "step": 1410 + }, + { + "epoch": 0.04060042887776984, + "grad_norm": 0.9334514737129211, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0551, + "step": 1420 + }, + { + "epoch": 0.04088634739099357, + "grad_norm": 1.481874942779541, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0664, + "step": 1430 + }, + { + "epoch": 0.0411722659042173, + "grad_norm": 0.9553850889205933, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0645, + "step": 1440 + }, + { + "epoch": 0.04145818441744103, + "grad_norm": 0.8824119567871094, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0694, + "step": 1450 + }, + { + "epoch": 0.04174410293066476, + "grad_norm": 1.0382661819458008, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0683, + "step": 1460 + }, + { + "epoch": 0.04203002144388849, + "grad_norm": 0.5914127826690674, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0652, + "step": 1470 + }, + { + "epoch": 0.04231593995711222, + "grad_norm": 0.8497964143753052, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0706, + "step": 1480 + }, + { + "epoch": 0.04260185847033596, + "grad_norm": 0.897759199142456, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0647, + "step": 1490 + }, + { + "epoch": 0.04288777698355969, + "grad_norm": 1.1102443933486938, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0579, + "step": 1500 + }, + { + "epoch": 0.04317369549678342, + "grad_norm": 0.7638678550720215, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0606, + "step": 1510 + }, + { + "epoch": 0.04345961401000715, + "grad_norm": 0.6662708520889282, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.067, + "step": 1520 + }, + { + "epoch": 0.04374553252323088, + "grad_norm": 0.4957924485206604, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0685, + "step": 1530 + }, + { + "epoch": 0.04403145103645461, + "grad_norm": 0.6456794738769531, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0665, + "step": 1540 + }, + { + "epoch": 0.04431736954967834, + "grad_norm": 1.1598498821258545, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0527, + "step": 1550 + }, + { + "epoch": 0.04460328806290207, + "grad_norm": 0.931520938873291, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0682, + "step": 1560 + }, + { + "epoch": 0.0448892065761258, + "grad_norm": 0.7289925813674927, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0726, + "step": 1570 + }, + { + "epoch": 0.04517512508934954, + "grad_norm": 0.5471235513687134, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0561, + "step": 1580 + }, + { + "epoch": 0.04546104360257327, + "grad_norm": 0.8686550259590149, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0552, + "step": 1590 + }, + { + "epoch": 0.045746962115797, + "grad_norm": 1.1767120361328125, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0544, + "step": 1600 + }, + { + "epoch": 0.04603288062902073, + "grad_norm": 0.8729729056358337, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0625, + "step": 1610 + }, + { + "epoch": 0.04631879914224446, + "grad_norm": 1.3734601736068726, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0667, + "step": 1620 + }, + { + "epoch": 0.04660471765546819, + "grad_norm": 0.6810682415962219, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0522, + "step": 1630 + }, + { + "epoch": 0.04689063616869192, + "grad_norm": 0.7744873762130737, + "learning_rate": 1.997844517262844e-05, + "loss": 0.06, + "step": 1640 + }, + { + "epoch": 0.04717655468191565, + "grad_norm": 1.000954270362854, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0606, + "step": 1650 + }, + { + "epoch": 0.04746247319513938, + "grad_norm": 0.8105701208114624, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.04774839170836312, + "grad_norm": 0.9504240155220032, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0833, + "step": 1670 + }, + { + "epoch": 0.04803431022158685, + "grad_norm": 0.910836935043335, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0582, + "step": 1680 + }, + { + "epoch": 0.04832022873481058, + "grad_norm": 0.5865645408630371, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0657, + "step": 1690 + }, + { + "epoch": 0.04860614724803431, + "grad_norm": 1.0098698139190674, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 0.04889206576125804, + "grad_norm": 0.8097764253616333, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0563, + "step": 1710 + }, + { + "epoch": 0.04917798427448177, + "grad_norm": 0.9958128333091736, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0597, + "step": 1720 + }, + { + "epoch": 0.0494639027877055, + "grad_norm": 0.8471905589103699, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.04974982130092923, + "grad_norm": 0.647058367729187, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 0.05003573981415296, + "grad_norm": 1.0832161903381348, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.052, + "step": 1750 + }, + { + "epoch": 0.0503216583273767, + "grad_norm": 0.8469381332397461, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0615, + "step": 1760 + }, + { + "epoch": 0.05060757684060043, + "grad_norm": 0.5371052622795105, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0556, + "step": 1770 + }, + { + "epoch": 0.05089349535382416, + "grad_norm": 0.9016183614730835, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0561, + "step": 1780 + }, + { + "epoch": 0.05117941386704789, + "grad_norm": 0.8829526305198669, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0648, + "step": 1790 + }, + { + "epoch": 0.05146533238027162, + "grad_norm": 1.079738974571228, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0577, + "step": 1800 + }, + { + "epoch": 0.05175125089349535, + "grad_norm": 0.7496556639671326, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.052, + "step": 1810 + }, + { + "epoch": 0.05203716940671908, + "grad_norm": 0.7587016820907593, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0634, + "step": 1820 + }, + { + "epoch": 0.052323087919942814, + "grad_norm": 0.9622246623039246, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0543, + "step": 1830 + }, + { + "epoch": 0.052609006433166544, + "grad_norm": 0.6643623113632202, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0606, + "step": 1840 + }, + { + "epoch": 0.05289492494639028, + "grad_norm": 0.8060843348503113, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0562, + "step": 1850 + }, + { + "epoch": 0.05318084345961401, + "grad_norm": 0.7353034019470215, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0679, + "step": 1860 + }, + { + "epoch": 0.05346676197283774, + "grad_norm": 0.6636782288551331, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0561, + "step": 1870 + }, + { + "epoch": 0.05375268048606147, + "grad_norm": 0.6760010719299316, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0674, + "step": 1880 + }, + { + "epoch": 0.0540385989992852, + "grad_norm": 0.7144591808319092, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0551, + "step": 1890 + }, + { + "epoch": 0.054324517512508934, + "grad_norm": 0.8346575498580933, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.049, + "step": 1900 + }, + { + "epoch": 0.054610436025732664, + "grad_norm": 1.1682871580123901, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0554, + "step": 1910 + }, + { + "epoch": 0.054896354538956395, + "grad_norm": 0.9150840640068054, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0549, + "step": 1920 + }, + { + "epoch": 0.055182273052180125, + "grad_norm": 0.37064746022224426, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0547, + "step": 1930 + }, + { + "epoch": 0.05546819156540386, + "grad_norm": 1.1214783191680908, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0697, + "step": 1940 + }, + { + "epoch": 0.05575411007862759, + "grad_norm": 0.8259853720664978, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0614, + "step": 1950 + }, + { + "epoch": 0.056040028591851324, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0811, + "step": 1960 + }, + { + "epoch": 0.056325947105075054, + "grad_norm": 0.8764797449111938, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0605, + "step": 1970 + }, + { + "epoch": 0.056611865618298784, + "grad_norm": 0.770044207572937, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0481, + "step": 1980 + }, + { + "epoch": 0.056897784131522515, + "grad_norm": 1.333876132965088, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0729, + "step": 1990 + }, + { + "epoch": 0.057183702644746245, + "grad_norm": 0.5231258273124695, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.051, + "step": 2000 + }, + { + "epoch": 0.057469621157969976, + "grad_norm": 1.1937541961669922, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.061, + "step": 2010 + }, + { + "epoch": 0.057755539671193706, + "grad_norm": 0.7843487858772278, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0688, + "step": 2020 + }, + { + "epoch": 0.058041458184417444, + "grad_norm": 0.7956593632698059, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0878, + "step": 2030 + }, + { + "epoch": 0.058327376697641174, + "grad_norm": 0.5006444454193115, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0588, + "step": 2040 + }, + { + "epoch": 0.058613295210864905, + "grad_norm": 1.162245750427246, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0619, + "step": 2050 + }, + { + "epoch": 0.058899213724088635, + "grad_norm": 0.46943384408950806, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0584, + "step": 2060 + }, + { + "epoch": 0.059185132237312366, + "grad_norm": 0.3780323266983032, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0462, + "step": 2070 + }, + { + "epoch": 0.059471050750536096, + "grad_norm": 0.7066171765327454, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0652, + "step": 2080 + }, + { + "epoch": 0.05975696926375983, + "grad_norm": 0.8464685082435608, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0546, + "step": 2090 + }, + { + "epoch": 0.06004288777698356, + "grad_norm": 0.7198944687843323, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0534, + "step": 2100 + }, + { + "epoch": 0.06032880629020729, + "grad_norm": 0.7136557698249817, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0665, + "step": 2110 + }, + { + "epoch": 0.060614724803431025, + "grad_norm": 0.8739225268363953, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0542, + "step": 2120 + }, + { + "epoch": 0.060900643316654755, + "grad_norm": 0.6694063544273376, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0575, + "step": 2130 + }, + { + "epoch": 0.061186561829878486, + "grad_norm": 0.4805296063423157, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0578, + "step": 2140 + }, + { + "epoch": 0.061472480343102216, + "grad_norm": 0.758660078048706, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0455, + "step": 2150 + }, + { + "epoch": 0.06175839885632595, + "grad_norm": 0.8114968538284302, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0801, + "step": 2160 + }, + { + "epoch": 0.06204431736954968, + "grad_norm": 0.6585670113563538, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0564, + "step": 2170 + }, + { + "epoch": 0.06233023588277341, + "grad_norm": 1.2986794710159302, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0595, + "step": 2180 + }, + { + "epoch": 0.06261615439599715, + "grad_norm": 0.9822471141815186, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0515, + "step": 2190 + }, + { + "epoch": 0.06290207290922087, + "grad_norm": 0.8112025260925293, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0585, + "step": 2200 + }, + { + "epoch": 0.0631879914224446, + "grad_norm": 0.6239551305770874, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0641, + "step": 2210 + }, + { + "epoch": 0.06347390993566833, + "grad_norm": 0.8405657410621643, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.057, + "step": 2220 + }, + { + "epoch": 0.06375982844889207, + "grad_norm": 0.4925670623779297, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0568, + "step": 2230 + }, + { + "epoch": 0.06404574696211579, + "grad_norm": 0.8599978089332581, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0587, + "step": 2240 + }, + { + "epoch": 0.06433166547533953, + "grad_norm": 0.8657258749008179, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.06461758398856327, + "grad_norm": 0.5826218128204346, + "learning_rate": 1.991642153373178e-05, + "loss": 0.055, + "step": 2260 + }, + { + "epoch": 0.06490350250178699, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0533, + "step": 2270 + }, + { + "epoch": 0.06518942101501073, + "grad_norm": 0.8345134258270264, + "learning_rate": 1.991374933341515e-05, + "loss": 0.064, + "step": 2280 + }, + { + "epoch": 0.06547533952823445, + "grad_norm": 0.6610177755355835, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0553, + "step": 2290 + }, + { + "epoch": 0.06576125804145819, + "grad_norm": 0.8541404604911804, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0483, + "step": 2300 + }, + { + "epoch": 0.06604717655468191, + "grad_norm": 0.9029123187065125, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0517, + "step": 2310 + }, + { + "epoch": 0.06633309506790565, + "grad_norm": 0.614111602306366, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.06661901358112937, + "grad_norm": 0.8723806142807007, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0511, + "step": 2330 + }, + { + "epoch": 0.06690493209435311, + "grad_norm": 0.5288586020469666, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0474, + "step": 2340 + }, + { + "epoch": 0.06719085060757685, + "grad_norm": 0.6346511840820312, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0532, + "step": 2350 + }, + { + "epoch": 0.06747676912080057, + "grad_norm": 0.9112687706947327, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0583, + "step": 2360 + }, + { + "epoch": 0.06776268763402431, + "grad_norm": 0.6879385113716125, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0551, + "step": 2370 + }, + { + "epoch": 0.06804860614724803, + "grad_norm": 0.6945562958717346, + "learning_rate": 1.989976094288735e-05, + "loss": 0.053, + "step": 2380 + }, + { + "epoch": 0.06833452466047177, + "grad_norm": 0.6774301528930664, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0596, + "step": 2390 + }, + { + "epoch": 0.06862044317369549, + "grad_norm": 0.7311446070671082, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0576, + "step": 2400 + }, + { + "epoch": 0.06890636168691923, + "grad_norm": 0.9301936030387878, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0608, + "step": 2410 + }, + { + "epoch": 0.06919228020014295, + "grad_norm": 1.1750341653823853, + "learning_rate": 1.989387305123247e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.06947819871336669, + "grad_norm": 0.716266930103302, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.053, + "step": 2430 + }, + { + "epoch": 0.06976411722659043, + "grad_norm": 0.8549973964691162, + "learning_rate": 1.989086647373215e-05, + "loss": 0.061, + "step": 2440 + }, + { + "epoch": 0.07005003573981415, + "grad_norm": 0.7306638360023499, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "epoch": 0.07033595425303789, + "grad_norm": 1.2529624700546265, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0597, + "step": 2460 + }, + { + "epoch": 0.07062187276626161, + "grad_norm": 0.7199717164039612, + "learning_rate": 1.988627835751598e-05, + "loss": 0.047, + "step": 2470 + }, + { + "epoch": 0.07090779127948535, + "grad_norm": 0.8007253408432007, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0544, + "step": 2480 + }, + { + "epoch": 0.07119370979270907, + "grad_norm": 0.7852535843849182, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0507, + "step": 2490 + }, + { + "epoch": 0.07147962830593281, + "grad_norm": 1.0649739503860474, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.058, + "step": 2500 + }, + { + "epoch": 0.07176554681915653, + "grad_norm": 0.8080071806907654, + "learning_rate": 1.988001487826387e-05, + "loss": 0.059, + "step": 2510 + }, + { + "epoch": 0.07205146533238027, + "grad_norm": 0.49453601241111755, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0522, + "step": 2520 + }, + { + "epoch": 0.07233738384560401, + "grad_norm": 0.7618975639343262, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.07262330235882773, + "grad_norm": 0.6284596920013428, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.058, + "step": 2540 + }, + { + "epoch": 0.07290922087205147, + "grad_norm": 1.6536812782287598, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0535, + "step": 2550 + }, + { + "epoch": 0.07319513938527519, + "grad_norm": 0.6516987681388855, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.061, + "step": 2560 + }, + { + "epoch": 0.07348105789849893, + "grad_norm": 0.7660441398620605, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0603, + "step": 2570 + }, + { + "epoch": 0.07376697641172265, + "grad_norm": 0.7900884747505188, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0494, + "step": 2580 + }, + { + "epoch": 0.07405289492494639, + "grad_norm": 0.9578459858894348, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0492, + "step": 2590 + }, + { + "epoch": 0.07433881343817011, + "grad_norm": 0.5268751978874207, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0524, + "step": 2600 + }, + { + "epoch": 0.07462473195139385, + "grad_norm": 0.8935990929603577, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0537, + "step": 2610 + }, + { + "epoch": 0.07491065046461759, + "grad_norm": 0.940441370010376, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0545, + "step": 2620 + }, + { + "epoch": 0.07519656897784131, + "grad_norm": 0.42767468094825745, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0508, + "step": 2630 + }, + { + "epoch": 0.07548248749106505, + "grad_norm": 0.6892207860946655, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0417, + "step": 2640 + }, + { + "epoch": 0.07576840600428877, + "grad_norm": 1.2622859477996826, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0665, + "step": 2650 + }, + { + "epoch": 0.07605432451751251, + "grad_norm": 0.8809115290641785, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0617, + "step": 2660 + }, + { + "epoch": 0.07634024303073624, + "grad_norm": 0.604371190071106, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0577, + "step": 2670 + }, + { + "epoch": 0.07662616154395997, + "grad_norm": 0.7091525793075562, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0552, + "step": 2680 + }, + { + "epoch": 0.0769120800571837, + "grad_norm": 0.7841326594352722, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0579, + "step": 2690 + }, + { + "epoch": 0.07719799857040743, + "grad_norm": 0.7789046764373779, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0511, + "step": 2700 + }, + { + "epoch": 0.07748391708363117, + "grad_norm": 0.6497660875320435, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0532, + "step": 2710 + }, + { + "epoch": 0.0777698355968549, + "grad_norm": 0.6902356147766113, + "learning_rate": 1.984439891859038e-05, + "loss": 0.06, + "step": 2720 + }, + { + "epoch": 0.07805575411007863, + "grad_norm": 0.5721703767776489, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0442, + "step": 2730 + }, + { + "epoch": 0.07834167262330236, + "grad_norm": 0.5205336809158325, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0551, + "step": 2740 + }, + { + "epoch": 0.07862759113652609, + "grad_norm": 1.0646073818206787, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0546, + "step": 2750 + }, + { + "epoch": 0.07891350964974982, + "grad_norm": 0.6809906363487244, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0603, + "step": 2760 + }, + { + "epoch": 0.07919942816297355, + "grad_norm": 0.7592756152153015, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0611, + "step": 2770 + }, + { + "epoch": 0.07948534667619728, + "grad_norm": 0.970733106136322, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.066, + "step": 2780 + }, + { + "epoch": 0.07977126518942101, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.983150881656814e-05, + "loss": 0.049, + "step": 2790 + }, + { + "epoch": 0.08005718370264475, + "grad_norm": 0.6761397123336792, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.048, + "step": 2800 + }, + { + "epoch": 0.08034310221586848, + "grad_norm": 0.9752228856086731, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.08062902072909221, + "grad_norm": 0.8727124929428101, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0629, + "step": 2820 + }, + { + "epoch": 0.08091493924231594, + "grad_norm": 0.8425240516662598, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0517, + "step": 2830 + }, + { + "epoch": 0.08120085775553967, + "grad_norm": 0.7011470198631287, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0484, + "step": 2840 + }, + { + "epoch": 0.0814867762687634, + "grad_norm": 0.836200475692749, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0504, + "step": 2850 + }, + { + "epoch": 0.08177269478198713, + "grad_norm": 0.4431964159011841, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0527, + "step": 2860 + }, + { + "epoch": 0.08205861329521086, + "grad_norm": 0.4666791260242462, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0556, + "step": 2870 + }, + { + "epoch": 0.0823445318084346, + "grad_norm": 0.5705346465110779, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0544, + "step": 2880 + }, + { + "epoch": 0.08263045032165833, + "grad_norm": 1.7237486839294434, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0555, + "step": 2890 + }, + { + "epoch": 0.08291636883488206, + "grad_norm": 0.9305147528648376, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.055, + "step": 2900 + }, + { + "epoch": 0.0832022873481058, + "grad_norm": 1.3475992679595947, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0723, + "step": 2910 + }, + { + "epoch": 0.08348820586132952, + "grad_norm": 0.7196787595748901, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0581, + "step": 2920 + }, + { + "epoch": 0.08377412437455325, + "grad_norm": 0.4567016363143921, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.08406004288777698, + "grad_norm": 0.8537796139717102, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0589, + "step": 2940 + }, + { + "epoch": 0.08434596140100072, + "grad_norm": 0.9526864886283875, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0581, + "step": 2950 + }, + { + "epoch": 0.08463187991422444, + "grad_norm": 0.8753517866134644, + "learning_rate": 1.979809151602651e-05, + "loss": 0.066, + "step": 2960 + }, + { + "epoch": 0.08491779842744818, + "grad_norm": 0.9062561988830566, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0472, + "step": 2970 + }, + { + "epoch": 0.08520371694067191, + "grad_norm": 1.0018329620361328, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0584, + "step": 2980 + }, + { + "epoch": 0.08548963545389564, + "grad_norm": 1.0577157735824585, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.05, + "step": 2990 + }, + { + "epoch": 0.08577555396711938, + "grad_norm": 1.0216799974441528, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0703, + "step": 3000 + }, + { + "epoch": 0.0860614724803431, + "grad_norm": 0.5581191778182983, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0682, + "step": 3010 + }, + { + "epoch": 0.08634739099356684, + "grad_norm": 0.6187682151794434, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 0.08663330950679056, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0565, + "step": 3030 + }, + { + "epoch": 0.0869192280200143, + "grad_norm": 0.8952509760856628, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0615, + "step": 3040 + }, + { + "epoch": 0.08720514653323802, + "grad_norm": 0.7387855648994446, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0434, + "step": 3050 + }, + { + "epoch": 0.08749106504646176, + "grad_norm": 0.8661363124847412, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0453, + "step": 3060 + }, + { + "epoch": 0.0877769835596855, + "grad_norm": 1.552089810371399, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0577, + "step": 3070 + }, + { + "epoch": 0.08806290207290922, + "grad_norm": 0.7555598616600037, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.058, + "step": 3080 + }, + { + "epoch": 0.08834882058613296, + "grad_norm": 0.7763100266456604, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.074, + "step": 3090 + }, + { + "epoch": 0.08863473909935668, + "grad_norm": 0.5088932514190674, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.07, + "step": 3100 + }, + { + "epoch": 0.08892065761258042, + "grad_norm": 0.517383873462677, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.08920657612580414, + "grad_norm": 0.9673930406570435, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.061, + "step": 3120 + }, + { + "epoch": 0.08949249463902788, + "grad_norm": 1.1182832717895508, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0503, + "step": 3130 + }, + { + "epoch": 0.0897784131522516, + "grad_norm": 0.8064592480659485, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0527, + "step": 3140 + }, + { + "epoch": 0.09006433166547534, + "grad_norm": 1.3616310358047485, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0491, + "step": 3150 + }, + { + "epoch": 0.09035025017869908, + "grad_norm": 0.6205968856811523, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0492, + "step": 3160 + }, + { + "epoch": 0.0906361686919228, + "grad_norm": 0.9427729249000549, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.054, + "step": 3170 + }, + { + "epoch": 0.09092208720514654, + "grad_norm": 0.6940050721168518, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0622, + "step": 3180 + }, + { + "epoch": 0.09120800571837026, + "grad_norm": 0.7082361578941345, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0474, + "step": 3190 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.4606474041938782, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 0.09177984274481772, + "grad_norm": 0.46445760130882263, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0483, + "step": 3210 + }, + { + "epoch": 0.09206576125804146, + "grad_norm": 0.7431371212005615, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.046, + "step": 3220 + }, + { + "epoch": 0.09235167977126518, + "grad_norm": 0.8430010676383972, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0665, + "step": 3230 + }, + { + "epoch": 0.09263759828448892, + "grad_norm": 0.9888875484466553, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0676, + "step": 3240 + }, + { + "epoch": 0.09292351679771266, + "grad_norm": 0.792150616645813, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0636, + "step": 3250 + }, + { + "epoch": 0.09320943531093638, + "grad_norm": 0.859030544757843, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0634, + "step": 3260 + }, + { + "epoch": 0.09349535382416012, + "grad_norm": 0.7612795233726501, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0658, + "step": 3270 + }, + { + "epoch": 0.09378127233738384, + "grad_norm": 0.5470104217529297, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0514, + "step": 3280 + }, + { + "epoch": 0.09406719085060758, + "grad_norm": 0.6354894042015076, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0489, + "step": 3290 + }, + { + "epoch": 0.0943531093638313, + "grad_norm": 1.3852356672286987, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 0.09463902787705504, + "grad_norm": 0.5610274076461792, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.09492494639027876, + "grad_norm": 1.2192410230636597, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0525, + "step": 3320 + }, + { + "epoch": 0.0952108649035025, + "grad_norm": 1.06831955909729, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.051, + "step": 3330 + }, + { + "epoch": 0.09549678341672624, + "grad_norm": 0.32288479804992676, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0503, + "step": 3340 + }, + { + "epoch": 0.09578270192994996, + "grad_norm": 0.5871645212173462, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0564, + "step": 3350 + }, + { + "epoch": 0.0960686204431737, + "grad_norm": 0.6069591045379639, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0495, + "step": 3360 + }, + { + "epoch": 0.09635453895639742, + "grad_norm": 1.0015379190444946, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0624, + "step": 3370 + }, + { + "epoch": 0.09664045746962116, + "grad_norm": 0.7534980773925781, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0618, + "step": 3380 + }, + { + "epoch": 0.09692637598284488, + "grad_norm": 0.45888280868530273, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0445, + "step": 3390 + }, + { + "epoch": 0.09721229449606862, + "grad_norm": 0.7550806403160095, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0461, + "step": 3400 + }, + { + "epoch": 0.09749821300929234, + "grad_norm": 0.4738181531429291, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 0.09778413152251608, + "grad_norm": 0.6711190938949585, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0476, + "step": 3420 + }, + { + "epoch": 0.09807005003573982, + "grad_norm": 0.4751316010951996, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0507, + "step": 3430 + }, + { + "epoch": 0.09835596854896354, + "grad_norm": 0.83565753698349, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "epoch": 0.09864188706218728, + "grad_norm": 0.5360665321350098, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0627, + "step": 3450 + }, + { + "epoch": 0.098927805575411, + "grad_norm": 0.7463604211807251, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0496, + "step": 3460 + }, + { + "epoch": 0.09921372408863474, + "grad_norm": 0.7294344305992126, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0697, + "step": 3470 + }, + { + "epoch": 0.09949964260185847, + "grad_norm": 0.5676283836364746, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0541, + "step": 3480 + }, + { + "epoch": 0.0997855611150822, + "grad_norm": 0.5879732370376587, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 0.10007147962830593, + "grad_norm": 0.832818865776062, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0505, + "step": 3500 + }, + { + "epoch": 0.10035739814152966, + "grad_norm": 0.48553410172462463, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0573, + "step": 3510 + }, + { + "epoch": 0.1006433166547534, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0449, + "step": 3520 + }, + { + "epoch": 0.10092923516797712, + "grad_norm": 0.7497885227203369, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0737, + "step": 3530 + }, + { + "epoch": 0.10121515368120086, + "grad_norm": 0.5581928491592407, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0514, + "step": 3540 + }, + { + "epoch": 0.10150107219442459, + "grad_norm": 1.140236258506775, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0524, + "step": 3550 + }, + { + "epoch": 0.10178699070764832, + "grad_norm": 0.8161870241165161, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0721, + "step": 3560 + }, + { + "epoch": 0.10207290922087205, + "grad_norm": 0.8796533942222595, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0564, + "step": 3570 + }, + { + "epoch": 0.10235882773409578, + "grad_norm": 1.4811128377914429, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.10264474624731951, + "grad_norm": 0.8029062747955322, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0589, + "step": 3590 + }, + { + "epoch": 0.10293066476054324, + "grad_norm": 0.7806634902954102, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0617, + "step": 3600 + }, + { + "epoch": 0.10321658327376698, + "grad_norm": 1.1286838054656982, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0574, + "step": 3610 + }, + { + "epoch": 0.1035025017869907, + "grad_norm": 0.374104768037796, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.047, + "step": 3620 + }, + { + "epoch": 0.10378842030021444, + "grad_norm": 1.1743136644363403, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0676, + "step": 3630 + }, + { + "epoch": 0.10407433881343817, + "grad_norm": 0.7684413194656372, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0663, + "step": 3640 + }, + { + "epoch": 0.1043602573266619, + "grad_norm": 1.0642409324645996, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.051, + "step": 3650 + }, + { + "epoch": 0.10464617583988563, + "grad_norm": 0.7752460837364197, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0543, + "step": 3660 + }, + { + "epoch": 0.10493209435310936, + "grad_norm": 0.9053257703781128, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.051, + "step": 3670 + }, + { + "epoch": 0.10521801286633309, + "grad_norm": 0.7407983541488647, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 0.10550393137955683, + "grad_norm": 1.3622519969940186, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0468, + "step": 3690 + }, + { + "epoch": 0.10578984989278056, + "grad_norm": 1.2751463651657104, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0571, + "step": 3700 + }, + { + "epoch": 0.10607576840600429, + "grad_norm": 0.5535411238670349, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0564, + "step": 3710 + }, + { + "epoch": 0.10636168691922802, + "grad_norm": 0.6728671193122864, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0487, + "step": 3720 + }, + { + "epoch": 0.10664760543245175, + "grad_norm": 0.82345050573349, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0656, + "step": 3730 + }, + { + "epoch": 0.10693352394567548, + "grad_norm": 0.6446594595909119, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0467, + "step": 3740 + }, + { + "epoch": 0.10721944245889921, + "grad_norm": 1.0836280584335327, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0536, + "step": 3750 + }, + { + "epoch": 0.10750536097212295, + "grad_norm": 0.3758300840854645, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0476, + "step": 3760 + }, + { + "epoch": 0.10779127948534667, + "grad_norm": 0.682266116142273, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0519, + "step": 3770 + }, + { + "epoch": 0.1080771979985704, + "grad_norm": 0.5025804042816162, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0474, + "step": 3780 + }, + { + "epoch": 0.10836311651179414, + "grad_norm": 1.019890308380127, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0492, + "step": 3790 + }, + { + "epoch": 0.10864903502501787, + "grad_norm": 0.7843710780143738, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0561, + "step": 3800 + }, + { + "epoch": 0.1089349535382416, + "grad_norm": 0.5028522610664368, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0582, + "step": 3810 + }, + { + "epoch": 0.10922087205146533, + "grad_norm": 0.6400144696235657, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0502, + "step": 3820 + }, + { + "epoch": 0.10950679056468907, + "grad_norm": 0.9432899355888367, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0471, + "step": 3830 + }, + { + "epoch": 0.10979270907791279, + "grad_norm": 0.7582482695579529, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.052, + "step": 3840 + }, + { + "epoch": 0.11007862759113653, + "grad_norm": 0.34035608172416687, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0464, + "step": 3850 + }, + { + "epoch": 0.11036454610436025, + "grad_norm": 1.3330878019332886, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.11065046461758399, + "grad_norm": 0.7309219837188721, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.052, + "step": 3870 + }, + { + "epoch": 0.11093638313080773, + "grad_norm": 0.6248922944068909, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0815, + "step": 3880 + }, + { + "epoch": 0.11122230164403145, + "grad_norm": 0.8298835158348083, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0491, + "step": 3890 + }, + { + "epoch": 0.11150822015725519, + "grad_norm": 0.6728928685188293, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0506, + "step": 3900 + }, + { + "epoch": 0.11179413867047891, + "grad_norm": 0.8456764817237854, + "learning_rate": 1.95567930185928e-05, + "loss": 0.051, + "step": 3910 + }, + { + "epoch": 0.11208005718370265, + "grad_norm": 0.9024212956428528, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0514, + "step": 3920 + }, + { + "epoch": 0.11236597569692637, + "grad_norm": 0.4843275845050812, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.056, + "step": 3930 + }, + { + "epoch": 0.11265189421015011, + "grad_norm": 0.5677530765533447, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0548, + "step": 3940 + }, + { + "epoch": 0.11293781272337383, + "grad_norm": 1.0913296937942505, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0697, + "step": 3950 + }, + { + "epoch": 0.11322373123659757, + "grad_norm": 0.6271129250526428, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0655, + "step": 3960 + }, + { + "epoch": 0.1135096497498213, + "grad_norm": 0.9063813090324402, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0469, + "step": 3970 + }, + { + "epoch": 0.11379556826304503, + "grad_norm": 0.7493836283683777, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0482, + "step": 3980 + }, + { + "epoch": 0.11408148677626877, + "grad_norm": 0.8022870421409607, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0655, + "step": 3990 + }, + { + "epoch": 0.11436740528949249, + "grad_norm": 0.6266750693321228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0542, + "step": 4000 + }, + { + "epoch": 0.11465332380271623, + "grad_norm": 0.45027732849121094, + "learning_rate": 1.95260726824789e-05, + "loss": 0.058, + "step": 4010 + }, + { + "epoch": 0.11493924231593995, + "grad_norm": 0.950760543346405, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0552, + "step": 4020 + }, + { + "epoch": 0.11522516082916369, + "grad_norm": 0.6397078037261963, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0544, + "step": 4030 + }, + { + "epoch": 0.11551107934238741, + "grad_norm": 0.7060579657554626, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0565, + "step": 4040 + }, + { + "epoch": 0.11579699785561115, + "grad_norm": 0.7861781716346741, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0511, + "step": 4050 + }, + { + "epoch": 0.11608291636883489, + "grad_norm": 0.5479229688644409, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0509, + "step": 4060 + }, + { + "epoch": 0.11636883488205861, + "grad_norm": 0.3854960501194, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0623, + "step": 4070 + }, + { + "epoch": 0.11665475339528235, + "grad_norm": 1.9533435106277466, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0643, + "step": 4080 + }, + { + "epoch": 0.11694067190850607, + "grad_norm": 0.5853668451309204, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0453, + "step": 4090 + }, + { + "epoch": 0.11722659042172981, + "grad_norm": 0.6850668787956238, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0555, + "step": 4100 + }, + { + "epoch": 0.11751250893495353, + "grad_norm": 1.1605839729309082, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0586, + "step": 4110 + }, + { + "epoch": 0.11779842744817727, + "grad_norm": 0.7753151059150696, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0623, + "step": 4120 + }, + { + "epoch": 0.118084345961401, + "grad_norm": 0.7955726385116577, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0467, + "step": 4130 + }, + { + "epoch": 0.11837026447462473, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0523, + "step": 4140 + }, + { + "epoch": 0.11865618298784847, + "grad_norm": 0.5821241140365601, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 0.11894210150107219, + "grad_norm": 0.4795539379119873, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0559, + "step": 4160 + }, + { + "epoch": 0.11922802001429593, + "grad_norm": 0.6324377655982971, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0473, + "step": 4170 + }, + { + "epoch": 0.11951393852751965, + "grad_norm": 0.8578745722770691, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0474, + "step": 4180 + }, + { + "epoch": 0.11979985704074339, + "grad_norm": 0.5988736748695374, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 0.12008577555396711, + "grad_norm": 0.8098701238632202, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0511, + "step": 4200 + }, + { + "epoch": 0.12037169406719085, + "grad_norm": 1.2059956789016724, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0501, + "step": 4210 + }, + { + "epoch": 0.12065761258041457, + "grad_norm": 0.7477571368217468, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0565, + "step": 4220 + }, + { + "epoch": 0.12094353109363831, + "grad_norm": 0.467942476272583, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0601, + "step": 4230 + }, + { + "epoch": 0.12122944960686205, + "grad_norm": 0.5761682391166687, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.052, + "step": 4240 + }, + { + "epoch": 0.12151536812008577, + "grad_norm": 0.8247032761573792, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0503, + "step": 4250 + }, + { + "epoch": 0.12180128663330951, + "grad_norm": 0.5218040347099304, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0434, + "step": 4260 + }, + { + "epoch": 0.12208720514653323, + "grad_norm": 0.5024936199188232, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 0.12237312365975697, + "grad_norm": 0.5558021664619446, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0493, + "step": 4280 + }, + { + "epoch": 0.1226590421729807, + "grad_norm": 0.6252139210700989, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0467, + "step": 4290 + }, + { + "epoch": 0.12294496068620443, + "grad_norm": 0.6613588929176331, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0572, + "step": 4300 + }, + { + "epoch": 0.12323087919942816, + "grad_norm": 0.8098927736282349, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0551, + "step": 4310 + }, + { + "epoch": 0.1235167977126519, + "grad_norm": 0.8598331809043884, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0517, + "step": 4320 + }, + { + "epoch": 0.12380271622587563, + "grad_norm": 1.2555822134017944, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0499, + "step": 4330 + }, + { + "epoch": 0.12408863473909935, + "grad_norm": 0.5311633348464966, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0467, + "step": 4340 + }, + { + "epoch": 0.12437455325232309, + "grad_norm": 0.5674521327018738, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0564, + "step": 4350 + }, + { + "epoch": 0.12466047176554682, + "grad_norm": 0.5226582884788513, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0503, + "step": 4360 + }, + { + "epoch": 0.12494639027877055, + "grad_norm": 0.8510275483131409, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0626, + "step": 4370 + }, + { + "epoch": 0.1252323087919943, + "grad_norm": 1.6184005737304688, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0565, + "step": 4380 + }, + { + "epoch": 0.125518227305218, + "grad_norm": 0.7836401462554932, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0567, + "step": 4390 + }, + { + "epoch": 0.12580414581844174, + "grad_norm": 0.686989963054657, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0727, + "step": 4400 + }, + { + "epoch": 0.12609006433166547, + "grad_norm": 0.6000984907150269, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0459, + "step": 4410 + }, + { + "epoch": 0.1263759828448892, + "grad_norm": 0.8751336932182312, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0509, + "step": 4420 + }, + { + "epoch": 0.12666190135811295, + "grad_norm": 0.9281551837921143, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0536, + "step": 4430 + }, + { + "epoch": 0.12694781987133666, + "grad_norm": 0.5268979668617249, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 0.1272337383845604, + "grad_norm": 0.9246962070465088, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0525, + "step": 4450 + }, + { + "epoch": 0.12751965689778413, + "grad_norm": 1.2159569263458252, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0559, + "step": 4460 + }, + { + "epoch": 0.12780557541100787, + "grad_norm": 1.1705470085144043, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0473, + "step": 4470 + }, + { + "epoch": 0.12809149392423158, + "grad_norm": 0.4624033570289612, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0385, + "step": 4480 + }, + { + "epoch": 0.12837741243745532, + "grad_norm": 0.68497633934021, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.055, + "step": 4490 + }, + { + "epoch": 0.12866333095067906, + "grad_norm": 0.6132450699806213, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0512, + "step": 4500 + }, + { + "epoch": 0.1289492494639028, + "grad_norm": 0.7438398003578186, + "learning_rate": 1.935753861926916e-05, + "loss": 0.057, + "step": 4510 + }, + { + "epoch": 0.12923516797712653, + "grad_norm": 1.01064133644104, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0542, + "step": 4520 + }, + { + "epoch": 0.12952108649035024, + "grad_norm": 0.7620115280151367, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0511, + "step": 4530 + }, + { + "epoch": 0.12980700500357398, + "grad_norm": 0.8325042128562927, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0434, + "step": 4540 + }, + { + "epoch": 0.13009292351679771, + "grad_norm": 1.333525538444519, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0527, + "step": 4550 + }, + { + "epoch": 0.13037884203002145, + "grad_norm": 0.5498093962669373, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0455, + "step": 4560 + }, + { + "epoch": 0.13066476054324516, + "grad_norm": 0.8072503209114075, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0471, + "step": 4570 + }, + { + "epoch": 0.1309506790564689, + "grad_norm": 0.7596970200538635, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0476, + "step": 4580 + }, + { + "epoch": 0.13123659756969264, + "grad_norm": 0.5895066857337952, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.058, + "step": 4590 + }, + { + "epoch": 0.13152251608291637, + "grad_norm": 0.7977209687232971, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0453, + "step": 4600 + }, + { + "epoch": 0.1318084345961401, + "grad_norm": 0.6070771813392639, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0626, + "step": 4610 + }, + { + "epoch": 0.13209435310936382, + "grad_norm": 0.776318371295929, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0502, + "step": 4620 + }, + { + "epoch": 0.13238027162258756, + "grad_norm": 0.7913787961006165, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0495, + "step": 4630 + }, + { + "epoch": 0.1326661901358113, + "grad_norm": 0.7327920794487, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0537, + "step": 4640 + }, + { + "epoch": 0.13295210864903503, + "grad_norm": 1.2004122734069824, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0479, + "step": 4650 + }, + { + "epoch": 0.13323802716225874, + "grad_norm": 0.663301408290863, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0426, + "step": 4660 + }, + { + "epoch": 0.13352394567548248, + "grad_norm": 0.7744486331939697, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0538, + "step": 4670 + }, + { + "epoch": 0.13380986418870622, + "grad_norm": 0.6179795265197754, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.13409578270192996, + "grad_norm": 0.6461634635925293, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0458, + "step": 4690 + }, + { + "epoch": 0.1343817012151537, + "grad_norm": 0.6578474640846252, + "learning_rate": 1.928703895604588e-05, + "loss": 0.064, + "step": 4700 + }, + { + "epoch": 0.1346676197283774, + "grad_norm": 0.8851020336151123, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0632, + "step": 4710 + }, + { + "epoch": 0.13495353824160114, + "grad_norm": 0.4704781472682953, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0431, + "step": 4720 + }, + { + "epoch": 0.13523945675482488, + "grad_norm": 0.9809741377830505, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.059, + "step": 4730 + }, + { + "epoch": 0.13552537526804861, + "grad_norm": 0.9307458400726318, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0528, + "step": 4740 + }, + { + "epoch": 0.13581129378127232, + "grad_norm": 0.8084405660629272, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0603, + "step": 4750 + }, + { + "epoch": 0.13609721229449606, + "grad_norm": 0.6919799447059631, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0589, + "step": 4760 + }, + { + "epoch": 0.1363831308077198, + "grad_norm": 0.8543849587440491, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0508, + "step": 4770 + }, + { + "epoch": 0.13666904932094354, + "grad_norm": 0.6308473348617554, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0485, + "step": 4780 + }, + { + "epoch": 0.13695496783416727, + "grad_norm": 0.739931046962738, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0534, + "step": 4790 + }, + { + "epoch": 0.13724088634739098, + "grad_norm": 0.7895604372024536, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0449, + "step": 4800 + }, + { + "epoch": 0.13752680486061472, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0496, + "step": 4810 + }, + { + "epoch": 0.13781272337383846, + "grad_norm": 0.5999978184700012, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.1380986418870622, + "grad_norm": 0.8037213087081909, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0667, + "step": 4830 + }, + { + "epoch": 0.1383845604002859, + "grad_norm": 0.7414689064025879, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0509, + "step": 4840 + }, + { + "epoch": 0.13867047891350964, + "grad_norm": 0.6627739667892456, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0515, + "step": 4850 + }, + { + "epoch": 0.13895639742673338, + "grad_norm": 0.6969587802886963, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0626, + "step": 4860 + }, + { + "epoch": 0.13924231593995712, + "grad_norm": 0.7554855942726135, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0478, + "step": 4870 + }, + { + "epoch": 0.13952823445318085, + "grad_norm": 0.5623564124107361, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.044, + "step": 4880 + }, + { + "epoch": 0.13981415296640456, + "grad_norm": 0.6897832751274109, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0466, + "step": 4890 + }, + { + "epoch": 0.1401000714796283, + "grad_norm": 0.5474520921707153, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0514, + "step": 4900 + }, + { + "epoch": 0.14038598999285204, + "grad_norm": 0.9736361503601074, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0472, + "step": 4910 + }, + { + "epoch": 0.14067190850607578, + "grad_norm": 0.5566041469573975, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0533, + "step": 4920 + }, + { + "epoch": 0.1409578270192995, + "grad_norm": 1.0295166969299316, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0478, + "step": 4930 + }, + { + "epoch": 0.14124374553252322, + "grad_norm": 1.0931389331817627, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0652, + "step": 4940 + }, + { + "epoch": 0.14152966404574696, + "grad_norm": 1.3054399490356445, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0564, + "step": 4950 + }, + { + "epoch": 0.1418155825589707, + "grad_norm": 0.45592883229255676, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0456, + "step": 4960 + }, + { + "epoch": 0.14210150107219444, + "grad_norm": 0.6758268475532532, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0543, + "step": 4970 + }, + { + "epoch": 0.14238741958541815, + "grad_norm": 0.9643615484237671, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 0.14267333809864188, + "grad_norm": 0.565969705581665, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0458, + "step": 4990 + }, + { + "epoch": 0.14295925661186562, + "grad_norm": 0.8053064346313477, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0558, + "step": 5000 + }, + { + "epoch": 0.14324517512508936, + "grad_norm": 0.606215238571167, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0562, + "step": 5010 + }, + { + "epoch": 0.14353109363831307, + "grad_norm": 0.5565656423568726, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0543, + "step": 5020 + }, + { + "epoch": 0.1438170121515368, + "grad_norm": 0.353696346282959, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0451, + "step": 5030 + }, + { + "epoch": 0.14410293066476054, + "grad_norm": 0.6627641916275024, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0607, + "step": 5040 + }, + { + "epoch": 0.14438884917798428, + "grad_norm": 0.7896742224693298, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0496, + "step": 5050 + }, + { + "epoch": 0.14467476769120802, + "grad_norm": 0.7444631457328796, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0641, + "step": 5060 + }, + { + "epoch": 0.14496068620443173, + "grad_norm": 0.7871376872062683, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 0.14524660471765546, + "grad_norm": 0.7784642577171326, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0466, + "step": 5080 + }, + { + "epoch": 0.1455325232308792, + "grad_norm": 0.6950685381889343, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0457, + "step": 5090 + }, + { + "epoch": 0.14581844174410294, + "grad_norm": 1.0631619691848755, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0513, + "step": 5100 + }, + { + "epoch": 0.14610436025732665, + "grad_norm": 0.4327051639556885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0599, + "step": 5110 + }, + { + "epoch": 0.14639027877055039, + "grad_norm": 0.7790032029151917, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0617, + "step": 5120 + }, + { + "epoch": 0.14667619728377412, + "grad_norm": 0.42061591148376465, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.14696211579699786, + "grad_norm": 1.4090712070465088, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0564, + "step": 5140 + }, + { + "epoch": 0.1472480343102216, + "grad_norm": 0.540844738483429, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0505, + "step": 5150 + }, + { + "epoch": 0.1475339528234453, + "grad_norm": 0.5608566999435425, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0583, + "step": 5160 + }, + { + "epoch": 0.14781987133666905, + "grad_norm": 0.750708818435669, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0467, + "step": 5170 + }, + { + "epoch": 0.14810578984989278, + "grad_norm": 0.608989953994751, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0507, + "step": 5180 + }, + { + "epoch": 0.14839170836311652, + "grad_norm": 0.8176707029342651, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0455, + "step": 5190 + }, + { + "epoch": 0.14867762687634023, + "grad_norm": 0.5280511379241943, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0586, + "step": 5200 + }, + { + "epoch": 0.14896354538956397, + "grad_norm": 0.5914652347564697, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.042, + "step": 5210 + }, + { + "epoch": 0.1492494639027877, + "grad_norm": 0.4816238582134247, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0431, + "step": 5220 + }, + { + "epoch": 0.14953538241601144, + "grad_norm": 0.5413132309913635, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0453, + "step": 5230 + }, + { + "epoch": 0.14982130092923518, + "grad_norm": 0.749200701713562, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0505, + "step": 5240 + }, + { + "epoch": 0.1501072194424589, + "grad_norm": 0.8051598072052002, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.15039313795568263, + "grad_norm": 0.5365609526634216, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0383, + "step": 5260 + }, + { + "epoch": 0.15067905646890636, + "grad_norm": 0.5546812415122986, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0438, + "step": 5270 + }, + { + "epoch": 0.1509649749821301, + "grad_norm": 0.6248345375061035, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.045, + "step": 5280 + }, + { + "epoch": 0.1512508934953538, + "grad_norm": 0.42673179507255554, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0435, + "step": 5290 + }, + { + "epoch": 0.15153681200857755, + "grad_norm": 0.6677115559577942, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 0.15182273052180129, + "grad_norm": 0.4739227294921875, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0516, + "step": 5310 + }, + { + "epoch": 0.15210864903502502, + "grad_norm": 0.7931821346282959, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0566, + "step": 5320 + }, + { + "epoch": 0.15239456754824876, + "grad_norm": 0.6296460032463074, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0496, + "step": 5330 + }, + { + "epoch": 0.15268048606147247, + "grad_norm": 0.6713911890983582, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0462, + "step": 5340 + }, + { + "epoch": 0.1529664045746962, + "grad_norm": 1.088040828704834, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0663, + "step": 5350 + }, + { + "epoch": 0.15325232308791994, + "grad_norm": 1.4942265748977661, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0541, + "step": 5360 + }, + { + "epoch": 0.15353824160114368, + "grad_norm": 1.5721286535263062, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0546, + "step": 5370 + }, + { + "epoch": 0.1538241601143674, + "grad_norm": 0.9329798221588135, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0538, + "step": 5380 + }, + { + "epoch": 0.15411007862759113, + "grad_norm": 0.5658103823661804, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0561, + "step": 5390 + }, + { + "epoch": 0.15439599714081487, + "grad_norm": 0.6210218071937561, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.054, + "step": 5400 + }, + { + "epoch": 0.1546819156540386, + "grad_norm": 0.7934702634811401, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0506, + "step": 5410 + }, + { + "epoch": 0.15496783416726234, + "grad_norm": 1.0321810245513916, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0483, + "step": 5420 + }, + { + "epoch": 0.15525375268048605, + "grad_norm": 0.6226248145103455, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0464, + "step": 5430 + }, + { + "epoch": 0.1555396711937098, + "grad_norm": 0.6217877864837646, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0466, + "step": 5440 + }, + { + "epoch": 0.15582558970693353, + "grad_norm": 0.44068101048469543, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0517, + "step": 5450 + }, + { + "epoch": 0.15611150822015726, + "grad_norm": 0.4715922772884369, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0391, + "step": 5460 + }, + { + "epoch": 0.15639742673338097, + "grad_norm": 0.6649858951568604, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0524, + "step": 5470 + }, + { + "epoch": 0.1566833452466047, + "grad_norm": 0.5635918974876404, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.054, + "step": 5480 + }, + { + "epoch": 0.15696926375982845, + "grad_norm": 0.5584990978240967, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0559, + "step": 5490 + }, + { + "epoch": 0.15725518227305219, + "grad_norm": 0.7777124047279358, + "learning_rate": 1.895206504082557e-05, + "loss": 0.052, + "step": 5500 + }, + { + "epoch": 0.15754110078627592, + "grad_norm": 0.7057285308837891, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0507, + "step": 5510 + }, + { + "epoch": 0.15782701929949963, + "grad_norm": 0.4290146827697754, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0508, + "step": 5520 + }, + { + "epoch": 0.15811293781272337, + "grad_norm": 0.7333746552467346, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0609, + "step": 5530 + }, + { + "epoch": 0.1583988563259471, + "grad_norm": 0.6905514001846313, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0441, + "step": 5540 + }, + { + "epoch": 0.15868477483917084, + "grad_norm": 0.4859441816806793, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0586, + "step": 5550 + }, + { + "epoch": 0.15897069335239455, + "grad_norm": 0.4259501099586487, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0446, + "step": 5560 + }, + { + "epoch": 0.1592566118656183, + "grad_norm": 0.7659216523170471, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0486, + "step": 5570 + }, + { + "epoch": 0.15954253037884203, + "grad_norm": 0.6377918124198914, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0497, + "step": 5580 + }, + { + "epoch": 0.15982844889206577, + "grad_norm": 0.9122095704078674, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0497, + "step": 5590 + }, + { + "epoch": 0.1601143674052895, + "grad_norm": 0.5986319780349731, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0789, + "step": 5600 + }, + { + "epoch": 0.1604002859185132, + "grad_norm": 0.6486982107162476, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0481, + "step": 5610 + }, + { + "epoch": 0.16068620443173695, + "grad_norm": 0.9778286814689636, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0609, + "step": 5620 + }, + { + "epoch": 0.1609721229449607, + "grad_norm": 0.9133608341217041, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0499, + "step": 5630 + }, + { + "epoch": 0.16125804145818443, + "grad_norm": 0.8979085087776184, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0539, + "step": 5640 + }, + { + "epoch": 0.16154395997140814, + "grad_norm": 0.7787102460861206, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0593, + "step": 5650 + }, + { + "epoch": 0.16182987848463187, + "grad_norm": 0.8269296884536743, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0466, + "step": 5660 + }, + { + "epoch": 0.1621157969978556, + "grad_norm": 1.0018537044525146, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0542, + "step": 5670 + }, + { + "epoch": 0.16240171551107935, + "grad_norm": 0.6690066456794739, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0504, + "step": 5680 + }, + { + "epoch": 0.16268763402430308, + "grad_norm": 0.8186119198799133, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0523, + "step": 5690 + }, + { + "epoch": 0.1629735525375268, + "grad_norm": 0.6039218902587891, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.053, + "step": 5700 + }, + { + "epoch": 0.16325947105075053, + "grad_norm": 0.5570294857025146, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0627, + "step": 5710 + }, + { + "epoch": 0.16354538956397427, + "grad_norm": 0.6330029368400574, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.043, + "step": 5720 + }, + { + "epoch": 0.163831308077198, + "grad_norm": 0.42857953906059265, + "learning_rate": 1.884459101447439e-05, + "loss": 0.043, + "step": 5730 + }, + { + "epoch": 0.16411722659042172, + "grad_norm": 0.6611765027046204, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0478, + "step": 5740 + }, + { + "epoch": 0.16440314510364545, + "grad_norm": 0.5025321841239929, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0563, + "step": 5750 + }, + { + "epoch": 0.1646890636168692, + "grad_norm": 0.468772292137146, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0579, + "step": 5760 + }, + { + "epoch": 0.16497498213009293, + "grad_norm": 0.8914149403572083, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0451, + "step": 5770 + }, + { + "epoch": 0.16526090064331667, + "grad_norm": 0.7421362996101379, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0446, + "step": 5780 + }, + { + "epoch": 0.16554681915654038, + "grad_norm": 0.6159907579421997, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0486, + "step": 5790 + }, + { + "epoch": 0.1658327376697641, + "grad_norm": 0.7762402892112732, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0528, + "step": 5800 + }, + { + "epoch": 0.16611865618298785, + "grad_norm": 0.688562273979187, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0562, + "step": 5810 + }, + { + "epoch": 0.1664045746962116, + "grad_norm": 0.6233720183372498, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0454, + "step": 5820 + }, + { + "epoch": 0.1666904932094353, + "grad_norm": 1.0762931108474731, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0586, + "step": 5830 + }, + { + "epoch": 0.16697641172265903, + "grad_norm": 0.6782101988792419, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0486, + "step": 5840 + }, + { + "epoch": 0.16726233023588277, + "grad_norm": 0.8854986429214478, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0668, + "step": 5850 + }, + { + "epoch": 0.1675482487491065, + "grad_norm": 0.6537308096885681, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0456, + "step": 5860 + }, + { + "epoch": 0.16783416726233025, + "grad_norm": 1.4588080644607544, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0531, + "step": 5870 + }, + { + "epoch": 0.16812008577555396, + "grad_norm": 0.4888838529586792, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0608, + "step": 5880 + }, + { + "epoch": 0.1684060042887777, + "grad_norm": 0.6046859622001648, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0596, + "step": 5890 + }, + { + "epoch": 0.16869192280200143, + "grad_norm": 1.0373053550720215, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0672, + "step": 5900 + }, + { + "epoch": 0.16897784131522517, + "grad_norm": 0.7728743553161621, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0567, + "step": 5910 + }, + { + "epoch": 0.16926375982844888, + "grad_norm": 0.7804396152496338, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0443, + "step": 5920 + }, + { + "epoch": 0.16954967834167262, + "grad_norm": 0.5331568717956543, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0462, + "step": 5930 + }, + { + "epoch": 0.16983559685489635, + "grad_norm": 0.5623118877410889, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0549, + "step": 5940 + }, + { + "epoch": 0.1701215153681201, + "grad_norm": 0.5113009214401245, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.17040743388134383, + "grad_norm": 0.45996031165122986, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0474, + "step": 5960 + }, + { + "epoch": 0.17069335239456754, + "grad_norm": 0.9673702716827393, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0496, + "step": 5970 + }, + { + "epoch": 0.17097927090779128, + "grad_norm": 0.6134442687034607, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0472, + "step": 5980 + }, + { + "epoch": 0.171265189421015, + "grad_norm": 0.5929660797119141, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0448, + "step": 5990 + }, + { + "epoch": 0.17155110793423875, + "grad_norm": 0.6973591446876526, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0561, + "step": 6000 + }, + { + "epoch": 0.17183702644746246, + "grad_norm": 0.6361686587333679, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0528, + "step": 6010 + }, + { + "epoch": 0.1721229449606862, + "grad_norm": 0.8463344573974609, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0505, + "step": 6020 + }, + { + "epoch": 0.17240886347390993, + "grad_norm": 0.7931243777275085, + "learning_rate": 1.869709961183946e-05, + "loss": 0.047, + "step": 6030 + }, + { + "epoch": 0.17269478198713367, + "grad_norm": 0.8827673196792603, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0449, + "step": 6040 + }, + { + "epoch": 0.1729807005003574, + "grad_norm": 0.624167263507843, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0432, + "step": 6050 + }, + { + "epoch": 0.17326661901358112, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0546, + "step": 6060 + }, + { + "epoch": 0.17355253752680486, + "grad_norm": 0.6836652755737305, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0463, + "step": 6070 + }, + { + "epoch": 0.1738384560400286, + "grad_norm": 0.5454772114753723, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0554, + "step": 6080 + }, + { + "epoch": 0.17412437455325233, + "grad_norm": 0.3758164048194885, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0437, + "step": 6090 + }, + { + "epoch": 0.17441029306647604, + "grad_norm": 0.4269026517868042, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0436, + "step": 6100 + }, + { + "epoch": 0.17469621157969978, + "grad_norm": 1.3504232168197632, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0563, + "step": 6110 + }, + { + "epoch": 0.17498213009292352, + "grad_norm": 0.6270191669464111, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0552, + "step": 6120 + }, + { + "epoch": 0.17526804860614725, + "grad_norm": 0.7632624506950378, + "learning_rate": 1.864612143364565e-05, + "loss": 0.042, + "step": 6130 + }, + { + "epoch": 0.175553967119371, + "grad_norm": 0.7420883774757385, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0472, + "step": 6140 + }, + { + "epoch": 0.1758398856325947, + "grad_norm": 0.38518550992012024, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0494, + "step": 6150 + }, + { + "epoch": 0.17612580414581844, + "grad_norm": 0.4203122556209564, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.049, + "step": 6160 + }, + { + "epoch": 0.17641172265904217, + "grad_norm": 0.843169093132019, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0528, + "step": 6170 + }, + { + "epoch": 0.1766976411722659, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0441, + "step": 6180 + }, + { + "epoch": 0.17698355968548962, + "grad_norm": 0.9894040822982788, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0494, + "step": 6190 + }, + { + "epoch": 0.17726947819871336, + "grad_norm": 0.8269744515419006, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0533, + "step": 6200 + }, + { + "epoch": 0.1775553967119371, + "grad_norm": 0.7923200726509094, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0518, + "step": 6210 + }, + { + "epoch": 0.17784131522516083, + "grad_norm": 0.580436646938324, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 0.17812723373838457, + "grad_norm": 1.0633399486541748, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0528, + "step": 6230 + }, + { + "epoch": 0.17841315225160828, + "grad_norm": 0.925599217414856, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0552, + "step": 6240 + }, + { + "epoch": 0.17869907076483202, + "grad_norm": 0.5874597430229187, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0473, + "step": 6250 + }, + { + "epoch": 0.17898498927805576, + "grad_norm": 0.9065818190574646, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0504, + "step": 6260 + }, + { + "epoch": 0.1792709077912795, + "grad_norm": 0.9060930609703064, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0578, + "step": 6270 + }, + { + "epoch": 0.1795568263045032, + "grad_norm": 0.6221855878829956, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.17984274481772694, + "grad_norm": 0.589621901512146, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0452, + "step": 6290 + }, + { + "epoch": 0.18012866333095068, + "grad_norm": 0.4308580756187439, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0528, + "step": 6300 + }, + { + "epoch": 0.18041458184417442, + "grad_norm": 0.34031248092651367, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0544, + "step": 6310 + }, + { + "epoch": 0.18070050035739815, + "grad_norm": 0.6438931226730347, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0411, + "step": 6320 + }, + { + "epoch": 0.18098641887062186, + "grad_norm": 0.5436957478523254, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0381, + "step": 6330 + }, + { + "epoch": 0.1812723373838456, + "grad_norm": 0.7326043248176575, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0486, + "step": 6340 + }, + { + "epoch": 0.18155825589706934, + "grad_norm": 0.9194608330726624, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0455, + "step": 6350 + }, + { + "epoch": 0.18184417441029307, + "grad_norm": 0.9366886019706726, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0529, + "step": 6360 + }, + { + "epoch": 0.18213009292351678, + "grad_norm": 0.3178311586380005, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0455, + "step": 6370 + }, + { + "epoch": 0.18241601143674052, + "grad_norm": 0.9811000823974609, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.054, + "step": 6380 + }, + { + "epoch": 0.18270192994996426, + "grad_norm": 0.4635869562625885, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0466, + "step": 6390 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.6958444118499756, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0448, + "step": 6400 + }, + { + "epoch": 0.18327376697641173, + "grad_norm": 0.765814483165741, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0558, + "step": 6410 + }, + { + "epoch": 0.18355968548963544, + "grad_norm": 0.4117525815963745, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0484, + "step": 6420 + }, + { + "epoch": 0.18384560400285918, + "grad_norm": 0.6114997267723083, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0449, + "step": 6430 + }, + { + "epoch": 0.18413152251608292, + "grad_norm": 0.6006572842597961, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0442, + "step": 6440 + }, + { + "epoch": 0.18441744102930666, + "grad_norm": 0.5918669104576111, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0472, + "step": 6450 + }, + { + "epoch": 0.18470335954253037, + "grad_norm": 0.42107391357421875, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0471, + "step": 6460 + }, + { + "epoch": 0.1849892780557541, + "grad_norm": 0.5666350722312927, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0451, + "step": 6470 + }, + { + "epoch": 0.18527519656897784, + "grad_norm": 0.6074198484420776, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.051, + "step": 6480 + }, + { + "epoch": 0.18556111508220158, + "grad_norm": 0.771105945110321, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0402, + "step": 6490 + }, + { + "epoch": 0.18584703359542531, + "grad_norm": 0.6381934881210327, + "learning_rate": 1.844974808419918e-05, + "loss": 0.049, + "step": 6500 + }, + { + "epoch": 0.18613295210864902, + "grad_norm": 0.4039069712162018, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0477, + "step": 6510 + }, + { + "epoch": 0.18641887062187276, + "grad_norm": 0.8936404585838318, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0515, + "step": 6520 + }, + { + "epoch": 0.1867047891350965, + "grad_norm": 0.5358276963233948, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0397, + "step": 6530 + }, + { + "epoch": 0.18699070764832024, + "grad_norm": 0.7260947823524475, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0501, + "step": 6540 + }, + { + "epoch": 0.18727662616154395, + "grad_norm": 0.6378960609436035, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0575, + "step": 6550 + }, + { + "epoch": 0.18756254467476768, + "grad_norm": 0.5879429578781128, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.041, + "step": 6560 + }, + { + "epoch": 0.18784846318799142, + "grad_norm": 0.846297025680542, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0494, + "step": 6570 + }, + { + "epoch": 0.18813438170121516, + "grad_norm": 0.5211764574050903, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0463, + "step": 6580 + }, + { + "epoch": 0.1884203002144389, + "grad_norm": 0.8060504794120789, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0486, + "step": 6590 + }, + { + "epoch": 0.1887062187276626, + "grad_norm": 0.5741685628890991, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0435, + "step": 6600 + }, + { + "epoch": 0.18899213724088634, + "grad_norm": 0.6195408701896667, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0609, + "step": 6610 + }, + { + "epoch": 0.18927805575411008, + "grad_norm": 0.46843090653419495, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0477, + "step": 6620 + }, + { + "epoch": 0.18956397426733382, + "grad_norm": 0.5169982314109802, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0515, + "step": 6630 + }, + { + "epoch": 0.18984989278055753, + "grad_norm": 0.5571608543395996, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0492, + "step": 6640 + }, + { + "epoch": 0.19013581129378126, + "grad_norm": 0.7798209190368652, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0682, + "step": 6650 + }, + { + "epoch": 0.190421729807005, + "grad_norm": 0.6120383143424988, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0516, + "step": 6660 + }, + { + "epoch": 0.19070764832022874, + "grad_norm": 1.0191924571990967, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.049, + "step": 6670 + }, + { + "epoch": 0.19099356683345248, + "grad_norm": 0.5271646976470947, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0461, + "step": 6680 + }, + { + "epoch": 0.1912794853466762, + "grad_norm": 0.3315111994743347, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 0.19156540385989992, + "grad_norm": 0.7598944306373596, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0576, + "step": 6700 + }, + { + "epoch": 0.19185132237312366, + "grad_norm": 0.8039186596870422, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0489, + "step": 6710 + }, + { + "epoch": 0.1921372408863474, + "grad_norm": 0.911704957485199, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0508, + "step": 6720 + }, + { + "epoch": 0.1924231593995711, + "grad_norm": 0.6092261672019958, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0494, + "step": 6730 + }, + { + "epoch": 0.19270907791279485, + "grad_norm": 0.7890674471855164, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.049, + "step": 6740 + }, + { + "epoch": 0.19299499642601858, + "grad_norm": 0.8601320385932922, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0402, + "step": 6750 + }, + { + "epoch": 0.19328091493924232, + "grad_norm": 0.8750951290130615, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0517, + "step": 6760 + }, + { + "epoch": 0.19356683345246606, + "grad_norm": 0.7143217921257019, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0414, + "step": 6770 + }, + { + "epoch": 0.19385275196568977, + "grad_norm": 0.8340809345245361, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.1941386704789135, + "grad_norm": 0.4074079692363739, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0455, + "step": 6790 + }, + { + "epoch": 0.19442458899213724, + "grad_norm": 0.5369135737419128, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0472, + "step": 6800 + }, + { + "epoch": 0.19471050750536098, + "grad_norm": 0.44467195868492126, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 0.1949964260185847, + "grad_norm": 0.6032440662384033, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0479, + "step": 6820 + }, + { + "epoch": 0.19528234453180843, + "grad_norm": 0.4078349173069, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 0.19556826304503216, + "grad_norm": 0.49480268359184265, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0432, + "step": 6840 + }, + { + "epoch": 0.1958541815582559, + "grad_norm": 0.9844514727592468, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0479, + "step": 6850 + }, + { + "epoch": 0.19614010007147964, + "grad_norm": 1.1353951692581177, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0539, + "step": 6860 + }, + { + "epoch": 0.19642601858470335, + "grad_norm": 0.7535272836685181, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0572, + "step": 6870 + }, + { + "epoch": 0.1967119370979271, + "grad_norm": 0.4950162470340729, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0466, + "step": 6880 + }, + { + "epoch": 0.19699785561115082, + "grad_norm": 0.5310598015785217, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0487, + "step": 6890 + }, + { + "epoch": 0.19728377412437456, + "grad_norm": 0.9481188654899597, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0448, + "step": 6900 + }, + { + "epoch": 0.19756969263759827, + "grad_norm": 0.5303207039833069, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0419, + "step": 6910 + }, + { + "epoch": 0.197855611150822, + "grad_norm": 0.6180852055549622, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 0.19814152966404575, + "grad_norm": 0.5310384631156921, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0471, + "step": 6930 + }, + { + "epoch": 0.19842744817726948, + "grad_norm": 0.546660304069519, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0481, + "step": 6940 + }, + { + "epoch": 0.19871336669049322, + "grad_norm": 0.7824214696884155, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0423, + "step": 6950 + }, + { + "epoch": 0.19899928520371693, + "grad_norm": 0.9130761623382568, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0436, + "step": 6960 + }, + { + "epoch": 0.19928520371694067, + "grad_norm": 1.0512481927871704, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 0.1995711222301644, + "grad_norm": 0.8660218715667725, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0533, + "step": 6980 + }, + { + "epoch": 0.19985704074338814, + "grad_norm": 0.5280078649520874, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0455, + "step": 6990 + }, + { + "epoch": 0.20014295925661185, + "grad_norm": 0.6151753067970276, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0476, + "step": 7000 + }, + { + "epoch": 0.2004288777698356, + "grad_norm": 0.7165628671646118, + "learning_rate": 1.815952390818299e-05, + "loss": 0.051, + "step": 7010 + }, + { + "epoch": 0.20071479628305933, + "grad_norm": 0.6857513189315796, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0566, + "step": 7020 + }, + { + "epoch": 0.20100071479628306, + "grad_norm": 0.5589154958724976, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0741, + "step": 7030 + }, + { + "epoch": 0.2012866333095068, + "grad_norm": 0.6684713959693909, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0461, + "step": 7040 + }, + { + "epoch": 0.2015725518227305, + "grad_norm": 0.41142046451568604, + "learning_rate": 1.813582526827608e-05, + "loss": 0.043, + "step": 7050 + }, + { + "epoch": 0.20185847033595425, + "grad_norm": 0.29734253883361816, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0464, + "step": 7060 + }, + { + "epoch": 0.20214438884917799, + "grad_norm": 0.3914707899093628, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.052, + "step": 7070 + }, + { + "epoch": 0.20243030736240172, + "grad_norm": 0.5075880885124207, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0401, + "step": 7080 + }, + { + "epoch": 0.20271622587562543, + "grad_norm": 0.6182138919830322, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0428, + "step": 7090 + }, + { + "epoch": 0.20300214438884917, + "grad_norm": 1.0438663959503174, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0476, + "step": 7100 + }, + { + "epoch": 0.2032880629020729, + "grad_norm": 0.4646940529346466, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0419, + "step": 7110 + }, + { + "epoch": 0.20357398141529665, + "grad_norm": 0.4236893951892853, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0539, + "step": 7120 + }, + { + "epoch": 0.20385989992852038, + "grad_norm": 0.7975651025772095, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0459, + "step": 7130 + }, + { + "epoch": 0.2041458184417441, + "grad_norm": 0.9628227949142456, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0568, + "step": 7140 + }, + { + "epoch": 0.20443173695496783, + "grad_norm": 0.8878718614578247, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0528, + "step": 7150 + }, + { + "epoch": 0.20471765546819157, + "grad_norm": 0.5407359004020691, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0417, + "step": 7160 + }, + { + "epoch": 0.2050035739814153, + "grad_norm": 0.4407803416252136, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0435, + "step": 7170 + }, + { + "epoch": 0.20528949249463901, + "grad_norm": 0.4055456221103668, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0652, + "step": 7180 + }, + { + "epoch": 0.20557541100786275, + "grad_norm": 0.44706887006759644, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0476, + "step": 7190 + }, + { + "epoch": 0.2058613295210865, + "grad_norm": 1.2640881538391113, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0496, + "step": 7200 + }, + { + "epoch": 0.20614724803431023, + "grad_norm": 0.3773214817047119, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0455, + "step": 7210 + }, + { + "epoch": 0.20643316654753396, + "grad_norm": 0.6460191011428833, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0765, + "step": 7220 + }, + { + "epoch": 0.20671908506075767, + "grad_norm": 0.6048172116279602, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0506, + "step": 7230 + }, + { + "epoch": 0.2070050035739814, + "grad_norm": 0.38502392172813416, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0487, + "step": 7240 + }, + { + "epoch": 0.20729092208720515, + "grad_norm": 1.5727262496948242, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.20757684060042889, + "grad_norm": 0.3985368609428406, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0501, + "step": 7260 + }, + { + "epoch": 0.2078627591136526, + "grad_norm": 0.4519219994544983, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0542, + "step": 7270 + }, + { + "epoch": 0.20814867762687633, + "grad_norm": 0.6547327637672424, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0628, + "step": 7280 + }, + { + "epoch": 0.20843459614010007, + "grad_norm": 0.7864896655082703, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0521, + "step": 7290 + }, + { + "epoch": 0.2087205146533238, + "grad_norm": 0.6605416536331177, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0501, + "step": 7300 + }, + { + "epoch": 0.20900643316654754, + "grad_norm": 0.8260928988456726, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 0.20929235167977125, + "grad_norm": 0.7167025804519653, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0465, + "step": 7320 + }, + { + "epoch": 0.209578270192995, + "grad_norm": 0.6838316917419434, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0449, + "step": 7330 + }, + { + "epoch": 0.20986418870621873, + "grad_norm": 0.46520882844924927, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0441, + "step": 7340 + }, + { + "epoch": 0.21015010721944247, + "grad_norm": 0.680860698223114, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0498, + "step": 7350 + }, + { + "epoch": 0.21043602573266618, + "grad_norm": 0.6697542071342468, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0361, + "step": 7360 + }, + { + "epoch": 0.21072194424588991, + "grad_norm": 0.9322425127029419, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0561, + "step": 7370 + }, + { + "epoch": 0.21100786275911365, + "grad_norm": 0.7454982399940491, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0464, + "step": 7380 + }, + { + "epoch": 0.2112937812723374, + "grad_norm": 0.5052962899208069, + "learning_rate": 1.792902262617481e-05, + "loss": 0.042, + "step": 7390 + }, + { + "epoch": 0.21157969978556113, + "grad_norm": 0.622719407081604, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0392, + "step": 7400 + }, + { + "epoch": 0.21186561829878484, + "grad_norm": 0.8296751976013184, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0512, + "step": 7410 + }, + { + "epoch": 0.21215153681200857, + "grad_norm": 0.7341750860214233, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0407, + "step": 7420 + }, + { + "epoch": 0.2124374553252323, + "grad_norm": 0.8206498026847839, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0491, + "step": 7430 + }, + { + "epoch": 0.21272337383845605, + "grad_norm": 0.5625871419906616, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0405, + "step": 7440 + }, + { + "epoch": 0.21300929235167976, + "grad_norm": 0.600284218788147, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0464, + "step": 7450 + }, + { + "epoch": 0.2132952108649035, + "grad_norm": 1.0839911699295044, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0481, + "step": 7460 + }, + { + "epoch": 0.21358112937812723, + "grad_norm": 0.45663371682167053, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0478, + "step": 7470 + }, + { + "epoch": 0.21386704789135097, + "grad_norm": 0.9196961522102356, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0566, + "step": 7480 + }, + { + "epoch": 0.2141529664045747, + "grad_norm": 0.5013288855552673, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0442, + "step": 7490 + }, + { + "epoch": 0.21443888491779842, + "grad_norm": 0.6444706916809082, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0484, + "step": 7500 + }, + { + "epoch": 0.21472480343102215, + "grad_norm": 0.5789361000061035, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0536, + "step": 7510 + }, + { + "epoch": 0.2150107219442459, + "grad_norm": 0.7474827170372009, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0526, + "step": 7520 + }, + { + "epoch": 0.21529664045746963, + "grad_norm": 0.7054215669631958, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0538, + "step": 7530 + }, + { + "epoch": 0.21558255897069334, + "grad_norm": 0.9778858423233032, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0533, + "step": 7540 + }, + { + "epoch": 0.21586847748391708, + "grad_norm": 0.7189548015594482, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0479, + "step": 7550 + }, + { + "epoch": 0.2161543959971408, + "grad_norm": 0.8761522769927979, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0512, + "step": 7560 + }, + { + "epoch": 0.21644031451036455, + "grad_norm": 0.6686418652534485, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.06, + "step": 7570 + }, + { + "epoch": 0.2167262330235883, + "grad_norm": 0.6385156512260437, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0495, + "step": 7580 + }, + { + "epoch": 0.217012151536812, + "grad_norm": 0.4785522520542145, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0477, + "step": 7590 + }, + { + "epoch": 0.21729807005003574, + "grad_norm": 0.883179783821106, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0472, + "step": 7600 + }, + { + "epoch": 0.21758398856325947, + "grad_norm": 0.5431568026542664, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0383, + "step": 7610 + }, + { + "epoch": 0.2178699070764832, + "grad_norm": 0.7085764408111572, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0465, + "step": 7620 + }, + { + "epoch": 0.21815582558970692, + "grad_norm": 0.4877212643623352, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0517, + "step": 7630 + }, + { + "epoch": 0.21844174410293066, + "grad_norm": 0.6874392032623291, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0555, + "step": 7640 + }, + { + "epoch": 0.2187276626161544, + "grad_norm": 0.9611791372299194, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0415, + "step": 7650 + }, + { + "epoch": 0.21901358112937813, + "grad_norm": 0.3618314862251282, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0601, + "step": 7660 + }, + { + "epoch": 0.21929949964260187, + "grad_norm": 0.5366251468658447, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0383, + "step": 7670 + }, + { + "epoch": 0.21958541815582558, + "grad_norm": 0.6323129534721375, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0536, + "step": 7680 + }, + { + "epoch": 0.21987133666904932, + "grad_norm": 0.4621681571006775, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0442, + "step": 7690 + }, + { + "epoch": 0.22015725518227305, + "grad_norm": 0.9297679662704468, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0517, + "step": 7700 + }, + { + "epoch": 0.2204431736954968, + "grad_norm": 0.5950489640235901, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0468, + "step": 7710 + }, + { + "epoch": 0.2207290922087205, + "grad_norm": 0.30251142382621765, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0471, + "step": 7720 + }, + { + "epoch": 0.22101501072194424, + "grad_norm": 0.6247804760932922, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0488, + "step": 7730 + }, + { + "epoch": 0.22130092923516798, + "grad_norm": 0.7118366360664368, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0567, + "step": 7740 + }, + { + "epoch": 0.2215868477483917, + "grad_norm": 0.6265056133270264, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.06, + "step": 7750 + }, + { + "epoch": 0.22187276626161545, + "grad_norm": 0.7232056260108948, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0393, + "step": 7760 + }, + { + "epoch": 0.22215868477483916, + "grad_norm": 0.7981307506561279, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0518, + "step": 7770 + }, + { + "epoch": 0.2224446032880629, + "grad_norm": 0.4492819011211395, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0425, + "step": 7780 + }, + { + "epoch": 0.22273052180128664, + "grad_norm": 0.578440248966217, + "learning_rate": 1.767371389304538e-05, + "loss": 0.043, + "step": 7790 + }, + { + "epoch": 0.22301644031451037, + "grad_norm": 0.8093826174736023, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0571, + "step": 7800 + }, + { + "epoch": 0.22330235882773408, + "grad_norm": 0.864661455154419, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0429, + "step": 7810 + }, + { + "epoch": 0.22358827734095782, + "grad_norm": 0.50054532289505, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0404, + "step": 7820 + }, + { + "epoch": 0.22387419585418156, + "grad_norm": 0.5690511465072632, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0406, + "step": 7830 + }, + { + "epoch": 0.2241601143674053, + "grad_norm": 0.7075231671333313, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0478, + "step": 7840 + }, + { + "epoch": 0.22444603288062903, + "grad_norm": 0.6326742768287659, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.065, + "step": 7850 + }, + { + "epoch": 0.22473195139385274, + "grad_norm": 0.48305049538612366, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0456, + "step": 7860 + }, + { + "epoch": 0.22501786990707648, + "grad_norm": 0.6333707571029663, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.048, + "step": 7870 + }, + { + "epoch": 0.22530378842030022, + "grad_norm": 0.6568662524223328, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0422, + "step": 7880 + }, + { + "epoch": 0.22558970693352395, + "grad_norm": 0.6302695870399475, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0589, + "step": 7890 + }, + { + "epoch": 0.22587562544674766, + "grad_norm": 0.6373940110206604, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0504, + "step": 7900 + }, + { + "epoch": 0.2261615439599714, + "grad_norm": 0.7108445167541504, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0486, + "step": 7910 + }, + { + "epoch": 0.22644746247319514, + "grad_norm": 0.5274208784103394, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0693, + "step": 7920 + }, + { + "epoch": 0.22673338098641888, + "grad_norm": 0.4020678997039795, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0461, + "step": 7930 + }, + { + "epoch": 0.2270192994996426, + "grad_norm": 0.5584745407104492, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0376, + "step": 7940 + }, + { + "epoch": 0.22730521801286632, + "grad_norm": 0.6614044904708862, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0461, + "step": 7950 + }, + { + "epoch": 0.22759113652609006, + "grad_norm": 0.506636917591095, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0431, + "step": 7960 + }, + { + "epoch": 0.2278770550393138, + "grad_norm": 0.5168156027793884, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0404, + "step": 7970 + }, + { + "epoch": 0.22816297355253753, + "grad_norm": 0.552480161190033, + "learning_rate": 1.754802282200567e-05, + "loss": 0.0565, + "step": 7980 + }, + { + "epoch": 0.22844889206576124, + "grad_norm": 0.8191191554069519, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0556, + "step": 7990 + }, + { + "epoch": 0.22873481057898498, + "grad_norm": 0.7767695188522339, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0447, + "step": 8000 + }, + { + "epoch": 0.22902072909220872, + "grad_norm": 0.9050281047821045, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0611, + "step": 8010 + }, + { + "epoch": 0.22930664760543246, + "grad_norm": 0.7805314660072327, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0532, + "step": 8020 + }, + { + "epoch": 0.2295925661186562, + "grad_norm": 0.6055987477302551, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0436, + "step": 8030 + }, + { + "epoch": 0.2298784846318799, + "grad_norm": 1.1075741052627563, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.053, + "step": 8040 + }, + { + "epoch": 0.23016440314510364, + "grad_norm": 0.6283855438232422, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0494, + "step": 8050 + }, + { + "epoch": 0.23045032165832738, + "grad_norm": 0.44009697437286377, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.047, + "step": 8060 + }, + { + "epoch": 0.23073624017155112, + "grad_norm": 0.4920162856578827, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0437, + "step": 8070 + }, + { + "epoch": 0.23102215868477483, + "grad_norm": 0.9286724328994751, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0513, + "step": 8080 + }, + { + "epoch": 0.23130807719799856, + "grad_norm": 0.6595107913017273, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0465, + "step": 8090 + }, + { + "epoch": 0.2315939957112223, + "grad_norm": 0.4930933713912964, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0422, + "step": 8100 + }, + { + "epoch": 0.23187991422444604, + "grad_norm": 0.6741859316825867, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0419, + "step": 8110 + }, + { + "epoch": 0.23216583273766978, + "grad_norm": 0.8081800937652588, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0449, + "step": 8120 + }, + { + "epoch": 0.23245175125089348, + "grad_norm": 1.0258036851882935, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0613, + "step": 8130 + }, + { + "epoch": 0.23273766976411722, + "grad_norm": 0.5007345080375671, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0473, + "step": 8140 + }, + { + "epoch": 0.23302358827734096, + "grad_norm": 0.3931804895401001, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0495, + "step": 8150 + }, + { + "epoch": 0.2333095067905647, + "grad_norm": 0.5907166600227356, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0449, + "step": 8160 + }, + { + "epoch": 0.2335954253037884, + "grad_norm": 0.49229851365089417, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0524, + "step": 8170 + }, + { + "epoch": 0.23388134381701214, + "grad_norm": 0.8386240601539612, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0527, + "step": 8180 + }, + { + "epoch": 0.23416726233023588, + "grad_norm": 0.7806615829467773, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0529, + "step": 8190 + }, + { + "epoch": 0.23445318084345962, + "grad_norm": 0.5716270804405212, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0534, + "step": 8200 + }, + { + "epoch": 0.23473909935668336, + "grad_norm": 1.165761947631836, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0591, + "step": 8210 + }, + { + "epoch": 0.23502501786990707, + "grad_norm": 0.867967426776886, + "learning_rate": 1.738529690353544e-05, + "loss": 0.049, + "step": 8220 + }, + { + "epoch": 0.2353109363831308, + "grad_norm": 0.5809492468833923, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0434, + "step": 8230 + }, + { + "epoch": 0.23559685489635454, + "grad_norm": 0.8418740034103394, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0461, + "step": 8240 + }, + { + "epoch": 0.23588277340957828, + "grad_norm": 0.5811617374420166, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0443, + "step": 8250 + }, + { + "epoch": 0.236168691922802, + "grad_norm": 0.7699318528175354, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0549, + "step": 8260 + }, + { + "epoch": 0.23645461043602573, + "grad_norm": 0.6066992878913879, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0415, + "step": 8270 + }, + { + "epoch": 0.23674052894924946, + "grad_norm": 0.7775973677635193, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0619, + "step": 8280 + }, + { + "epoch": 0.2370264474624732, + "grad_norm": 0.8320962190628052, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.048, + "step": 8290 + }, + { + "epoch": 0.23731236597569694, + "grad_norm": 0.7203818559646606, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0594, + "step": 8300 + }, + { + "epoch": 0.23759828448892065, + "grad_norm": 0.7634598612785339, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0614, + "step": 8310 + }, + { + "epoch": 0.23788420300214438, + "grad_norm": 0.557575523853302, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 0.23817012151536812, + "grad_norm": 1.0139968395233154, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0457, + "step": 8330 + }, + { + "epoch": 0.23845604002859186, + "grad_norm": 0.5543113946914673, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.048, + "step": 8340 + }, + { + "epoch": 0.23874195854181557, + "grad_norm": 1.0122590065002441, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0509, + "step": 8350 + }, + { + "epoch": 0.2390278770550393, + "grad_norm": 0.8776134252548218, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0475, + "step": 8360 + }, + { + "epoch": 0.23931379556826304, + "grad_norm": 0.41230106353759766, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0467, + "step": 8370 + }, + { + "epoch": 0.23959971408148678, + "grad_norm": 0.5460986495018005, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0455, + "step": 8380 + }, + { + "epoch": 0.23988563259471052, + "grad_norm": 0.5896333456039429, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.051, + "step": 8390 + }, + { + "epoch": 0.24017155110793423, + "grad_norm": 0.536375105381012, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0432, + "step": 8400 + }, + { + "epoch": 0.24045746962115797, + "grad_norm": 0.7597050666809082, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0459, + "step": 8410 + }, + { + "epoch": 0.2407433881343817, + "grad_norm": 0.6669795513153076, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0584, + "step": 8420 + }, + { + "epoch": 0.24102930664760544, + "grad_norm": 0.3614502251148224, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.039, + "step": 8430 + }, + { + "epoch": 0.24131522516082915, + "grad_norm": 0.5618023872375488, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0394, + "step": 8440 + }, + { + "epoch": 0.2416011436740529, + "grad_norm": 0.5897185802459717, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0502, + "step": 8450 + }, + { + "epoch": 0.24188706218727662, + "grad_norm": 0.5622876882553101, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0382, + "step": 8460 + }, + { + "epoch": 0.24217298070050036, + "grad_norm": 0.5639696717262268, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0652, + "step": 8470 + }, + { + "epoch": 0.2424588992137241, + "grad_norm": 0.5686836242675781, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0609, + "step": 8480 + }, + { + "epoch": 0.2427448177269478, + "grad_norm": 0.7248222827911377, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0512, + "step": 8490 + }, + { + "epoch": 0.24303073624017155, + "grad_norm": 0.6157225370407104, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0449, + "step": 8500 + }, + { + "epoch": 0.24331665475339528, + "grad_norm": 1.1660966873168945, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0427, + "step": 8510 + }, + { + "epoch": 0.24360257326661902, + "grad_norm": 1.1242589950561523, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0514, + "step": 8520 + }, + { + "epoch": 0.24388849177984273, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0491, + "step": 8530 + }, + { + "epoch": 0.24417441029306647, + "grad_norm": 0.41474589705467224, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0427, + "step": 8540 + }, + { + "epoch": 0.2444603288062902, + "grad_norm": 0.42195969820022583, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0486, + "step": 8550 + }, + { + "epoch": 0.24474624731951394, + "grad_norm": 0.3914433717727661, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0411, + "step": 8560 + }, + { + "epoch": 0.24503216583273768, + "grad_norm": 0.7590876817703247, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0439, + "step": 8570 + }, + { + "epoch": 0.2453180843459614, + "grad_norm": 0.4362296164035797, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0466, + "step": 8580 + }, + { + "epoch": 0.24560400285918513, + "grad_norm": 0.467949241399765, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0502, + "step": 8590 + }, + { + "epoch": 0.24588992137240887, + "grad_norm": 0.4731729328632355, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0599, + "step": 8600 + }, + { + "epoch": 0.2461758398856326, + "grad_norm": 0.491644948720932, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0524, + "step": 8610 + }, + { + "epoch": 0.2464617583988563, + "grad_norm": 0.5254928469657898, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0469, + "step": 8620 + }, + { + "epoch": 0.24674767691208005, + "grad_norm": 0.5721238255500793, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0493, + "step": 8630 + }, + { + "epoch": 0.2470335954253038, + "grad_norm": 0.5806096792221069, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0391, + "step": 8640 + }, + { + "epoch": 0.24731951393852752, + "grad_norm": 0.6683222055435181, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.24760543245175126, + "grad_norm": 0.41728726029396057, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0411, + "step": 8660 + }, + { + "epoch": 0.24789135096497497, + "grad_norm": 0.6001113653182983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0413, + "step": 8670 + }, + { + "epoch": 0.2481772694781987, + "grad_norm": 0.43813610076904297, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0389, + "step": 8680 + }, + { + "epoch": 0.24846318799142245, + "grad_norm": 1.5533791780471802, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0597, + "step": 8690 + }, + { + "epoch": 0.24874910650464618, + "grad_norm": 1.175837755203247, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 0.2490350250178699, + "grad_norm": 0.4798300862312317, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0459, + "step": 8710 + }, + { + "epoch": 0.24932094353109363, + "grad_norm": 0.7334772944450378, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0432, + "step": 8720 + }, + { + "epoch": 0.24960686204431737, + "grad_norm": 0.9633310437202454, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.05, + "step": 8730 + }, + { + "epoch": 0.2498927805575411, + "grad_norm": 0.7353480458259583, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.25017869907076484, + "grad_norm": 0.5958748459815979, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0428, + "step": 8750 + }, + { + "epoch": 0.2504646175839886, + "grad_norm": 0.8538689613342285, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0498, + "step": 8760 + }, + { + "epoch": 0.2507505360972123, + "grad_norm": 0.606607973575592, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.251036454610436, + "grad_norm": 0.3999035060405731, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0714, + "step": 8780 + }, + { + "epoch": 0.25132237312365974, + "grad_norm": 0.807314932346344, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.057, + "step": 8790 + }, + { + "epoch": 0.2516082916368835, + "grad_norm": 0.5238217115402222, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0485, + "step": 8800 + }, + { + "epoch": 0.2518942101501072, + "grad_norm": 1.6465950012207031, + "learning_rate": 1.696714953556411e-05, + "loss": 0.056, + "step": 8810 + }, + { + "epoch": 0.25218012866333095, + "grad_norm": 0.6568214297294617, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0424, + "step": 8820 + }, + { + "epoch": 0.2524660471765547, + "grad_norm": 0.4695168137550354, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0456, + "step": 8830 + }, + { + "epoch": 0.2527519656897784, + "grad_norm": 0.5652263164520264, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0527, + "step": 8840 + }, + { + "epoch": 0.25303788420300216, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0441, + "step": 8850 + }, + { + "epoch": 0.2533238027162259, + "grad_norm": 0.8288971781730652, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0513, + "step": 8860 + }, + { + "epoch": 0.2536097212294496, + "grad_norm": 0.8606051802635193, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0416, + "step": 8870 + }, + { + "epoch": 0.2538956397426733, + "grad_norm": 0.7235842347145081, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0481, + "step": 8880 + }, + { + "epoch": 0.25418155825589706, + "grad_norm": 0.9602673053741455, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0465, + "step": 8890 + }, + { + "epoch": 0.2544674767691208, + "grad_norm": 0.6431217789649963, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0489, + "step": 8900 + }, + { + "epoch": 0.25475339528234453, + "grad_norm": 0.42215701937675476, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0376, + "step": 8910 + }, + { + "epoch": 0.25503931379556827, + "grad_norm": 0.5899976491928101, + "learning_rate": 1.688644181174108e-05, + "loss": 0.048, + "step": 8920 + }, + { + "epoch": 0.255325232308792, + "grad_norm": 0.9504411816596985, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.047, + "step": 8930 + }, + { + "epoch": 0.25561115082201574, + "grad_norm": 0.5808438062667847, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0535, + "step": 8940 + }, + { + "epoch": 0.2558970693352395, + "grad_norm": 0.3811270594596863, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0418, + "step": 8950 + }, + { + "epoch": 0.25618298784846316, + "grad_norm": 1.0257363319396973, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0548, + "step": 8960 + }, + { + "epoch": 0.2564689063616869, + "grad_norm": 0.7294469475746155, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0569, + "step": 8970 + }, + { + "epoch": 0.25675482487491064, + "grad_norm": 0.4967000484466553, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0488, + "step": 8980 + }, + { + "epoch": 0.2570407433881344, + "grad_norm": 0.9160422086715698, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0471, + "step": 8990 + }, + { + "epoch": 0.2573266619013581, + "grad_norm": 0.5125435590744019, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.25761258041458185, + "grad_norm": 0.5617201328277588, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0597, + "step": 9010 + }, + { + "epoch": 0.2578984989278056, + "grad_norm": 0.7771851420402527, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0485, + "step": 9020 + }, + { + "epoch": 0.2581844174410293, + "grad_norm": 0.8434289693832397, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0429, + "step": 9030 + }, + { + "epoch": 0.25847033595425306, + "grad_norm": 0.513541042804718, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0488, + "step": 9040 + }, + { + "epoch": 0.25875625446747674, + "grad_norm": 1.0142096281051636, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0517, + "step": 9050 + }, + { + "epoch": 0.2590421729807005, + "grad_norm": 0.6343669295310974, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.049, + "step": 9060 + }, + { + "epoch": 0.2593280914939242, + "grad_norm": 0.33996936678886414, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.037, + "step": 9070 + }, + { + "epoch": 0.25961401000714796, + "grad_norm": 0.5964446663856506, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 0.2598999285203717, + "grad_norm": 0.4989728629589081, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0463, + "step": 9090 + }, + { + "epoch": 0.26018584703359543, + "grad_norm": 0.7735986113548279, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0576, + "step": 9100 + }, + { + "epoch": 0.26047176554681917, + "grad_norm": 1.2520418167114258, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0577, + "step": 9110 + }, + { + "epoch": 0.2607576840600429, + "grad_norm": 0.45247936248779297, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0458, + "step": 9120 + }, + { + "epoch": 0.26104360257326664, + "grad_norm": 0.8944823145866394, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0498, + "step": 9130 + }, + { + "epoch": 0.2613295210864903, + "grad_norm": 0.8308315277099609, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0545, + "step": 9140 + }, + { + "epoch": 0.26161543959971406, + "grad_norm": 0.6838778853416443, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 0.2619013581129378, + "grad_norm": 1.5998408794403076, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0591, + "step": 9160 + }, + { + "epoch": 0.26218727662616154, + "grad_norm": 0.8548596501350403, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.04, + "step": 9170 + }, + { + "epoch": 0.2624731951393853, + "grad_norm": 0.5784913897514343, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0464, + "step": 9180 + }, + { + "epoch": 0.262759113652609, + "grad_norm": 1.490502953529358, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0672, + "step": 9190 + }, + { + "epoch": 0.26304503216583275, + "grad_norm": 0.8950793743133545, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0532, + "step": 9200 + }, + { + "epoch": 0.2633309506790565, + "grad_norm": 0.5513611435890198, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 0.2636168691922802, + "grad_norm": 1.0512864589691162, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0487, + "step": 9220 + }, + { + "epoch": 0.2639027877055039, + "grad_norm": 0.48180028796195984, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0543, + "step": 9230 + }, + { + "epoch": 0.26418870621872764, + "grad_norm": 0.5451590418815613, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0553, + "step": 9240 + }, + { + "epoch": 0.2644746247319514, + "grad_norm": 0.6986148953437805, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0523, + "step": 9250 + }, + { + "epoch": 0.2647605432451751, + "grad_norm": 0.5977929830551147, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0446, + "step": 9260 + }, + { + "epoch": 0.26504646175839885, + "grad_norm": 0.6042361855506897, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0716, + "step": 9270 + }, + { + "epoch": 0.2653323802716226, + "grad_norm": 0.473418265581131, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0378, + "step": 9280 + }, + { + "epoch": 0.26561829878484633, + "grad_norm": 0.9332809448242188, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0484, + "step": 9290 + }, + { + "epoch": 0.26590421729807007, + "grad_norm": 0.5209246277809143, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0439, + "step": 9300 + }, + { + "epoch": 0.2661901358112938, + "grad_norm": 0.5742560625076294, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0468, + "step": 9310 + }, + { + "epoch": 0.2664760543245175, + "grad_norm": 0.585503876209259, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0507, + "step": 9320 + }, + { + "epoch": 0.2667619728377412, + "grad_norm": 0.5254957675933838, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0436, + "step": 9330 + }, + { + "epoch": 0.26704789135096496, + "grad_norm": 0.48314452171325684, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0417, + "step": 9340 + }, + { + "epoch": 0.2673338098641887, + "grad_norm": 0.630020022392273, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0425, + "step": 9350 + }, + { + "epoch": 0.26761972837741244, + "grad_norm": 0.3545299470424652, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0338, + "step": 9360 + }, + { + "epoch": 0.2679056468906362, + "grad_norm": 0.6934211850166321, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0445, + "step": 9370 + }, + { + "epoch": 0.2681915654038599, + "grad_norm": 0.6544952392578125, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0451, + "step": 9380 + }, + { + "epoch": 0.26847748391708365, + "grad_norm": 0.4581946134567261, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0422, + "step": 9390 + }, + { + "epoch": 0.2687634024303074, + "grad_norm": 0.6338506937026978, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0576, + "step": 9400 + }, + { + "epoch": 0.26904932094353107, + "grad_norm": 0.8165014386177063, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0474, + "step": 9410 + }, + { + "epoch": 0.2693352394567548, + "grad_norm": 0.793222188949585, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0546, + "step": 9420 + }, + { + "epoch": 0.26962115796997854, + "grad_norm": 0.3669852316379547, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0461, + "step": 9430 + }, + { + "epoch": 0.2699070764832023, + "grad_norm": 0.7339810729026794, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0433, + "step": 9440 + }, + { + "epoch": 0.270192994996426, + "grad_norm": 0.4948982298374176, + "learning_rate": 1.648606940465527e-05, + "loss": 0.048, + "step": 9450 + }, + { + "epoch": 0.27047891350964975, + "grad_norm": 0.4681016206741333, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0437, + "step": 9460 + }, + { + "epoch": 0.2707648320228735, + "grad_norm": 0.5091472864151001, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0576, + "step": 9470 + }, + { + "epoch": 0.27105075053609723, + "grad_norm": 0.5683515071868896, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0503, + "step": 9480 + }, + { + "epoch": 0.27133666904932097, + "grad_norm": 0.626844048500061, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0495, + "step": 9490 + }, + { + "epoch": 0.27162258756254465, + "grad_norm": 0.6757943034172058, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0495, + "step": 9500 + }, + { + "epoch": 0.2719085060757684, + "grad_norm": 0.7049196362495422, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0579, + "step": 9510 + }, + { + "epoch": 0.2721944245889921, + "grad_norm": 0.6469181776046753, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.051, + "step": 9520 + }, + { + "epoch": 0.27248034310221586, + "grad_norm": 0.5414942502975464, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0433, + "step": 9530 + }, + { + "epoch": 0.2727662616154396, + "grad_norm": 0.5642798542976379, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0495, + "step": 9540 + }, + { + "epoch": 0.27305218012866334, + "grad_norm": 1.0527595281600952, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0445, + "step": 9550 + }, + { + "epoch": 0.2733380986418871, + "grad_norm": 0.8501784801483154, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0627, + "step": 9560 + }, + { + "epoch": 0.2736240171551108, + "grad_norm": 0.7892033457756042, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 0.27390993566833455, + "grad_norm": 0.3588624596595764, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0512, + "step": 9580 + }, + { + "epoch": 0.27419585418155823, + "grad_norm": 0.7474772930145264, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.6217718124389648, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0435, + "step": 9600 + }, + { + "epoch": 0.2747676912080057, + "grad_norm": 0.7711623907089233, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.054, + "step": 9610 + }, + { + "epoch": 0.27505360972122944, + "grad_norm": 0.8171371221542358, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0371, + "step": 9620 + }, + { + "epoch": 0.2753395282344532, + "grad_norm": 0.8668338060379028, + "learning_rate": 1.634591312387623e-05, + "loss": 0.055, + "step": 9630 + }, + { + "epoch": 0.2756254467476769, + "grad_norm": 0.5683940052986145, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0478, + "step": 9640 + }, + { + "epoch": 0.27591136526090065, + "grad_norm": 0.44098007678985596, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0531, + "step": 9650 + }, + { + "epoch": 0.2761972837741244, + "grad_norm": 0.8305087685585022, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0462, + "step": 9660 + }, + { + "epoch": 0.27648320228734813, + "grad_norm": 0.9088799953460693, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0489, + "step": 9670 + }, + { + "epoch": 0.2767691208005718, + "grad_norm": 0.5590132474899292, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0486, + "step": 9680 + }, + { + "epoch": 0.27705503931379555, + "grad_norm": 0.776713490486145, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0443, + "step": 9690 + }, + { + "epoch": 0.2773409578270193, + "grad_norm": 0.6107578873634338, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0461, + "step": 9700 + }, + { + "epoch": 0.277626876340243, + "grad_norm": 0.4635901153087616, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0397, + "step": 9710 + }, + { + "epoch": 0.27791279485346676, + "grad_norm": 0.4220955967903137, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0463, + "step": 9720 + }, + { + "epoch": 0.2781987133666905, + "grad_norm": 0.4947739243507385, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0397, + "step": 9730 + }, + { + "epoch": 0.27848463187991424, + "grad_norm": 0.5589033961296082, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0426, + "step": 9740 + }, + { + "epoch": 0.278770550393138, + "grad_norm": 0.4904254972934723, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0458, + "step": 9750 + }, + { + "epoch": 0.2790564689063617, + "grad_norm": 0.34956127405166626, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.2793423874195854, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0424, + "step": 9770 + }, + { + "epoch": 0.27962830593280913, + "grad_norm": 0.48727869987487793, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0451, + "step": 9780 + }, + { + "epoch": 0.27991422444603287, + "grad_norm": 0.7314761281013489, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0523, + "step": 9790 + }, + { + "epoch": 0.2802001429592566, + "grad_norm": 0.5017405152320862, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0423, + "step": 9800 + }, + { + "epoch": 0.28048606147248034, + "grad_norm": 0.8375383615493774, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0435, + "step": 9810 + }, + { + "epoch": 0.2807719799857041, + "grad_norm": 0.8702818155288696, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0487, + "step": 9820 + }, + { + "epoch": 0.2810578984989278, + "grad_norm": 0.4649866223335266, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0483, + "step": 9830 + }, + { + "epoch": 0.28134381701215155, + "grad_norm": 0.7464607357978821, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0463, + "step": 9840 + }, + { + "epoch": 0.2816297355253753, + "grad_norm": 0.48055607080459595, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0418, + "step": 9850 + }, + { + "epoch": 0.281915654038599, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0433, + "step": 9860 + }, + { + "epoch": 0.2822015725518227, + "grad_norm": 0.8859265446662903, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0605, + "step": 9870 + }, + { + "epoch": 0.28248749106504645, + "grad_norm": 0.8236640691757202, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0441, + "step": 9880 + }, + { + "epoch": 0.2827734095782702, + "grad_norm": 0.6617199778556824, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0515, + "step": 9890 + }, + { + "epoch": 0.2830593280914939, + "grad_norm": 0.8017821907997131, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0503, + "step": 9900 + }, + { + "epoch": 0.28334524660471766, + "grad_norm": 1.070827603340149, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0485, + "step": 9910 + }, + { + "epoch": 0.2836311651179414, + "grad_norm": 1.021888256072998, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0479, + "step": 9920 + }, + { + "epoch": 0.28391708363116513, + "grad_norm": 0.34402501583099365, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0391, + "step": 9930 + }, + { + "epoch": 0.28420300214438887, + "grad_norm": 0.58541339635849, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0461, + "step": 9940 + }, + { + "epoch": 0.28448892065761255, + "grad_norm": 0.8062207102775574, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0553, + "step": 9950 + }, + { + "epoch": 0.2847748391708363, + "grad_norm": 0.6435661315917969, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0536, + "step": 9960 + }, + { + "epoch": 0.28506075768406003, + "grad_norm": 0.5670832395553589, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0405, + "step": 9970 + }, + { + "epoch": 0.28534667619728377, + "grad_norm": 0.45282548666000366, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0458, + "step": 9980 + }, + { + "epoch": 0.2856325947105075, + "grad_norm": 0.42272916436195374, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0392, + "step": 9990 + }, + { + "epoch": 0.28591851322373124, + "grad_norm": 0.5791928768157959, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0453, + "step": 10000 + }, + { + "epoch": 0.286204431736955, + "grad_norm": 0.9841408729553223, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.052, + "step": 10010 + }, + { + "epoch": 0.2864903502501787, + "grad_norm": 0.8658338785171509, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0461, + "step": 10020 + }, + { + "epoch": 0.28677626876340245, + "grad_norm": 0.624788224697113, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0416, + "step": 10030 + }, + { + "epoch": 0.28706218727662614, + "grad_norm": 0.6108028888702393, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0548, + "step": 10040 + }, + { + "epoch": 0.2873481057898499, + "grad_norm": 0.7907708883285522, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0406, + "step": 10050 + }, + { + "epoch": 0.2876340243030736, + "grad_norm": 0.7695413827896118, + "learning_rate": 1.60029690609047e-05, + "loss": 0.061, + "step": 10060 + }, + { + "epoch": 0.28791994281629735, + "grad_norm": 0.4407683312892914, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0483, + "step": 10070 + }, + { + "epoch": 0.2882058613295211, + "grad_norm": 0.6242743730545044, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.039, + "step": 10080 + }, + { + "epoch": 0.2884917798427448, + "grad_norm": 0.8752113580703735, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0433, + "step": 10090 + }, + { + "epoch": 0.28877769835596856, + "grad_norm": 0.8834511041641235, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0348, + "step": 10100 + }, + { + "epoch": 0.2890636168691923, + "grad_norm": 1.0036063194274902, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0593, + "step": 10110 + }, + { + "epoch": 0.28934953538241603, + "grad_norm": 0.5511205196380615, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0459, + "step": 10120 + }, + { + "epoch": 0.2896354538956397, + "grad_norm": 0.7717337012290955, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0381, + "step": 10130 + }, + { + "epoch": 0.28992137240886345, + "grad_norm": 1.123363971710205, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0462, + "step": 10140 + }, + { + "epoch": 0.2902072909220872, + "grad_norm": 0.6212007403373718, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0446, + "step": 10150 + }, + { + "epoch": 0.29049320943531093, + "grad_norm": 0.5547964572906494, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0362, + "step": 10160 + }, + { + "epoch": 0.29077912794853467, + "grad_norm": 0.593225359916687, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0462, + "step": 10170 + }, + { + "epoch": 0.2910650464617584, + "grad_norm": 0.5569560527801514, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0508, + "step": 10180 + }, + { + "epoch": 0.29135096497498214, + "grad_norm": 0.5464656949043274, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0399, + "step": 10190 + }, + { + "epoch": 0.2916368834882059, + "grad_norm": 1.2456778287887573, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0494, + "step": 10200 + }, + { + "epoch": 0.2919228020014296, + "grad_norm": 0.7862445712089539, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0551, + "step": 10210 + }, + { + "epoch": 0.2922087205146533, + "grad_norm": 0.745941698551178, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0469, + "step": 10220 + }, + { + "epoch": 0.29249463902787703, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0492, + "step": 10230 + }, + { + "epoch": 0.29278055754110077, + "grad_norm": 0.659205973148346, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0453, + "step": 10240 + }, + { + "epoch": 0.2930664760543245, + "grad_norm": 0.6925905346870422, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0463, + "step": 10250 + }, + { + "epoch": 0.29335239456754825, + "grad_norm": 0.479115754365921, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0395, + "step": 10260 + }, + { + "epoch": 0.293638313080772, + "grad_norm": 0.5085121393203735, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0504, + "step": 10270 + }, + { + "epoch": 0.2939242315939957, + "grad_norm": 0.46833914518356323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0411, + "step": 10280 + }, + { + "epoch": 0.29421015010721946, + "grad_norm": 0.4534672796726227, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0491, + "step": 10290 + }, + { + "epoch": 0.2944960686204432, + "grad_norm": 0.5704737305641174, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0391, + "step": 10300 + }, + { + "epoch": 0.2947819871336669, + "grad_norm": 1.0342676639556885, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0681, + "step": 10310 + }, + { + "epoch": 0.2950679056468906, + "grad_norm": 0.5002169013023376, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0429, + "step": 10320 + }, + { + "epoch": 0.29535382416011435, + "grad_norm": 0.5565863847732544, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0575, + "step": 10330 + }, + { + "epoch": 0.2956397426733381, + "grad_norm": 0.7826551198959351, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0448, + "step": 10340 + }, + { + "epoch": 0.29592566118656183, + "grad_norm": 0.7019012570381165, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0436, + "step": 10350 + }, + { + "epoch": 0.29621157969978557, + "grad_norm": 0.8324534893035889, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0503, + "step": 10360 + }, + { + "epoch": 0.2964974982130093, + "grad_norm": 0.7064073085784912, + "learning_rate": 1.574895332125391e-05, + "loss": 0.041, + "step": 10370 + }, + { + "epoch": 0.29678341672623304, + "grad_norm": 0.5634047389030457, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0474, + "step": 10380 + }, + { + "epoch": 0.2970693352394568, + "grad_norm": 0.8504926562309265, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0502, + "step": 10390 + }, + { + "epoch": 0.29735525375268046, + "grad_norm": 0.508313775062561, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0368, + "step": 10400 + }, + { + "epoch": 0.2976411722659042, + "grad_norm": 0.5851112008094788, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0472, + "step": 10410 + }, + { + "epoch": 0.29792709077912793, + "grad_norm": 0.5689557790756226, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0479, + "step": 10420 + }, + { + "epoch": 0.29821300929235167, + "grad_norm": 0.5026743412017822, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0406, + "step": 10430 + }, + { + "epoch": 0.2984989278055754, + "grad_norm": 0.5662751197814941, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0441, + "step": 10440 + }, + { + "epoch": 0.29878484631879915, + "grad_norm": 0.899709939956665, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.2990707648320229, + "grad_norm": 0.4681940972805023, + "learning_rate": 1.567419089313346e-05, + "loss": 0.054, + "step": 10460 + }, + { + "epoch": 0.2993566833452466, + "grad_norm": 0.39646071195602417, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0375, + "step": 10470 + }, + { + "epoch": 0.29964260185847036, + "grad_norm": 1.204815149307251, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0487, + "step": 10480 + }, + { + "epoch": 0.29992852037169404, + "grad_norm": 0.4507630467414856, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0516, + "step": 10490 + }, + { + "epoch": 0.3002144388849178, + "grad_norm": 0.9783321022987366, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0642, + "step": 10500 + }, + { + "epoch": 0.3005003573981415, + "grad_norm": 0.5406969785690308, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0447, + "step": 10510 + }, + { + "epoch": 0.30078627591136525, + "grad_norm": 0.44153860211372375, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0449, + "step": 10520 + }, + { + "epoch": 0.301072194424589, + "grad_norm": 0.5723687410354614, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0548, + "step": 10530 + }, + { + "epoch": 0.3013581129378127, + "grad_norm": 0.4453120529651642, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0434, + "step": 10540 + }, + { + "epoch": 0.30164403145103647, + "grad_norm": 0.34224697947502136, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0385, + "step": 10550 + }, + { + "epoch": 0.3019299499642602, + "grad_norm": 0.6389157176017761, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0569, + "step": 10560 + }, + { + "epoch": 0.30221586847748394, + "grad_norm": 0.5845953822135925, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0467, + "step": 10570 + }, + { + "epoch": 0.3025017869907076, + "grad_norm": 0.6581900119781494, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0422, + "step": 10580 + }, + { + "epoch": 0.30278770550393136, + "grad_norm": 0.4964161813259125, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0428, + "step": 10590 + }, + { + "epoch": 0.3030736240171551, + "grad_norm": 0.635380208492279, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0442, + "step": 10600 + }, + { + "epoch": 0.30335954253037883, + "grad_norm": 0.9795969128608704, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0517, + "step": 10610 + }, + { + "epoch": 0.30364546104360257, + "grad_norm": 0.9987231492996216, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0514, + "step": 10620 + }, + { + "epoch": 0.3039313795568263, + "grad_norm": 0.6384946703910828, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0471, + "step": 10630 + }, + { + "epoch": 0.30421729807005005, + "grad_norm": 0.49352115392684937, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0351, + "step": 10640 + }, + { + "epoch": 0.3045032165832738, + "grad_norm": 0.45028480887413025, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0438, + "step": 10650 + }, + { + "epoch": 0.3047891350964975, + "grad_norm": 0.5717794895172119, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0491, + "step": 10660 + }, + { + "epoch": 0.3050750536097212, + "grad_norm": 0.5436326265335083, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0407, + "step": 10670 + }, + { + "epoch": 0.30536097212294494, + "grad_norm": 0.7777692675590515, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0436, + "step": 10680 + }, + { + "epoch": 0.3056468906361687, + "grad_norm": 0.6597929000854492, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0407, + "step": 10690 + }, + { + "epoch": 0.3059328091493924, + "grad_norm": 0.6059311032295227, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0481, + "step": 10700 + }, + { + "epoch": 0.30621872766261615, + "grad_norm": 0.5530681014060974, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0418, + "step": 10710 + }, + { + "epoch": 0.3065046461758399, + "grad_norm": 0.5778716802597046, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0429, + "step": 10720 + }, + { + "epoch": 0.3067905646890636, + "grad_norm": 0.4573792517185211, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0586, + "step": 10730 + }, + { + "epoch": 0.30707648320228736, + "grad_norm": 0.8193615078926086, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0474, + "step": 10740 + }, + { + "epoch": 0.3073624017155111, + "grad_norm": 0.9410123229026794, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.3076483202287348, + "grad_norm": 0.8244432806968689, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0462, + "step": 10760 + }, + { + "epoch": 0.3079342387419585, + "grad_norm": 0.644899845123291, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0479, + "step": 10770 + }, + { + "epoch": 0.30822015725518226, + "grad_norm": 0.28044867515563965, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.04, + "step": 10780 + }, + { + "epoch": 0.308506075768406, + "grad_norm": 0.6538394093513489, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0406, + "step": 10790 + }, + { + "epoch": 0.30879199428162973, + "grad_norm": 0.9572822451591492, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0505, + "step": 10800 + }, + { + "epoch": 0.30907791279485347, + "grad_norm": 0.539826512336731, + "learning_rate": 1.537928999540189e-05, + "loss": 0.05, + "step": 10810 + }, + { + "epoch": 0.3093638313080772, + "grad_norm": 0.801988959312439, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0454, + "step": 10820 + }, + { + "epoch": 0.30964974982130095, + "grad_norm": 0.57478928565979, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.039, + "step": 10830 + }, + { + "epoch": 0.3099356683345247, + "grad_norm": 0.6313017010688782, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0384, + "step": 10840 + }, + { + "epoch": 0.31022158684774837, + "grad_norm": 0.507997989654541, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0365, + "step": 10850 + }, + { + "epoch": 0.3105075053609721, + "grad_norm": 0.5152313709259033, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0487, + "step": 10860 + }, + { + "epoch": 0.31079342387419584, + "grad_norm": 0.6123478412628174, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0405, + "step": 10870 + }, + { + "epoch": 0.3110793423874196, + "grad_norm": 1.079551100730896, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0443, + "step": 10880 + }, + { + "epoch": 0.3113652609006433, + "grad_norm": 0.39866960048675537, + "learning_rate": 1.531098472380285e-05, + "loss": 0.04, + "step": 10890 + }, + { + "epoch": 0.31165117941386705, + "grad_norm": 0.3715427815914154, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0387, + "step": 10900 + }, + { + "epoch": 0.3119370979270908, + "grad_norm": 0.7201068997383118, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.054, + "step": 10910 + }, + { + "epoch": 0.3122230164403145, + "grad_norm": 0.9512631893157959, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0383, + "step": 10920 + }, + { + "epoch": 0.31250893495353826, + "grad_norm": 0.5948206186294556, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0472, + "step": 10930 + }, + { + "epoch": 0.31279485346676195, + "grad_norm": 0.7174249291419983, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0437, + "step": 10940 + }, + { + "epoch": 0.3130807719799857, + "grad_norm": 0.6190982460975647, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0383, + "step": 10950 + }, + { + "epoch": 0.3133666904932094, + "grad_norm": 0.7733815312385559, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0327, + "step": 10960 + }, + { + "epoch": 0.31365260900643316, + "grad_norm": 1.2995271682739258, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0427, + "step": 10970 + }, + { + "epoch": 0.3139385275196569, + "grad_norm": 1.1102336645126343, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.04, + "step": 10980 + }, + { + "epoch": 0.31422444603288063, + "grad_norm": 0.7618277668952942, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0488, + "step": 10990 + }, + { + "epoch": 0.31451036454610437, + "grad_norm": 0.5355142951011658, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0436, + "step": 11000 + }, + { + "epoch": 0.3147962830593281, + "grad_norm": 1.3410072326660156, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0463, + "step": 11010 + }, + { + "epoch": 0.31508220157255185, + "grad_norm": 0.7810450196266174, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0493, + "step": 11020 + }, + { + "epoch": 0.3153681200857755, + "grad_norm": 0.6452206373214722, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0354, + "step": 11030 + }, + { + "epoch": 0.31565403859899926, + "grad_norm": 1.037593126296997, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0418, + "step": 11040 + }, + { + "epoch": 0.315939957112223, + "grad_norm": 0.7032834887504578, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0431, + "step": 11050 + }, + { + "epoch": 0.31622587562544674, + "grad_norm": 0.5168939232826233, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0472, + "step": 11060 + }, + { + "epoch": 0.3165117941386705, + "grad_norm": 0.5239925384521484, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0438, + "step": 11070 + }, + { + "epoch": 0.3167977126518942, + "grad_norm": 0.8209654688835144, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0506, + "step": 11080 + }, + { + "epoch": 0.31708363116511795, + "grad_norm": 0.5318232178688049, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0516, + "step": 11090 + }, + { + "epoch": 0.3173695496783417, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0482, + "step": 11100 + }, + { + "epoch": 0.3176554681915654, + "grad_norm": 0.6691215634346008, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.046, + "step": 11110 + }, + { + "epoch": 0.3179413867047891, + "grad_norm": 0.4862753450870514, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 0.31822730521801285, + "grad_norm": 0.4640316963195801, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0433, + "step": 11130 + }, + { + "epoch": 0.3185132237312366, + "grad_norm": 0.7841521501541138, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0445, + "step": 11140 + }, + { + "epoch": 0.3187991422444603, + "grad_norm": 0.6809426546096802, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0518, + "step": 11150 + }, + { + "epoch": 0.31908506075768406, + "grad_norm": 0.6195946931838989, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0569, + "step": 11160 + }, + { + "epoch": 0.3193709792709078, + "grad_norm": 0.7289860248565674, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0487, + "step": 11170 + }, + { + "epoch": 0.31965689778413153, + "grad_norm": 0.5575736165046692, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0409, + "step": 11180 + }, + { + "epoch": 0.31994281629735527, + "grad_norm": 0.8619267344474792, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0424, + "step": 11190 + }, + { + "epoch": 0.320228734810579, + "grad_norm": 0.740242063999176, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0474, + "step": 11200 + }, + { + "epoch": 0.3205146533238027, + "grad_norm": 0.4169894754886627, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0395, + "step": 11210 + }, + { + "epoch": 0.3208005718370264, + "grad_norm": 0.5773794651031494, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0414, + "step": 11220 + }, + { + "epoch": 0.32108649035025016, + "grad_norm": 0.4941500723361969, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0484, + "step": 11230 + }, + { + "epoch": 0.3213724088634739, + "grad_norm": 0.7985579371452332, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.051, + "step": 11240 + }, + { + "epoch": 0.32165832737669764, + "grad_norm": 0.5262066721916199, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0434, + "step": 11250 + }, + { + "epoch": 0.3219442458899214, + "grad_norm": 0.4074312150478363, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0428, + "step": 11260 + }, + { + "epoch": 0.3222301644031451, + "grad_norm": 1.0757715702056885, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0468, + "step": 11270 + }, + { + "epoch": 0.32251608291636885, + "grad_norm": 0.7281575202941895, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0386, + "step": 11280 + }, + { + "epoch": 0.3228020014295926, + "grad_norm": 0.35078516602516174, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0413, + "step": 11290 + }, + { + "epoch": 0.32308791994281627, + "grad_norm": 0.5642452836036682, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0435, + "step": 11300 + }, + { + "epoch": 0.32337383845604, + "grad_norm": 0.5326974987983704, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0459, + "step": 11310 + }, + { + "epoch": 0.32365975696926375, + "grad_norm": 0.6212049126625061, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0451, + "step": 11320 + }, + { + "epoch": 0.3239456754824875, + "grad_norm": 0.4887222349643707, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0445, + "step": 11330 + }, + { + "epoch": 0.3242315939957112, + "grad_norm": 0.6692403554916382, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0423, + "step": 11340 + }, + { + "epoch": 0.32451751250893496, + "grad_norm": 0.7166061997413635, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0445, + "step": 11350 + }, + { + "epoch": 0.3248034310221587, + "grad_norm": 0.5342463850975037, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0394, + "step": 11360 + }, + { + "epoch": 0.32508934953538243, + "grad_norm": 1.0617904663085938, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0401, + "step": 11370 + }, + { + "epoch": 0.32537526804860617, + "grad_norm": 0.9869458675384521, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0508, + "step": 11380 + }, + { + "epoch": 0.32566118656182985, + "grad_norm": 0.32021698355674744, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0346, + "step": 11390 + }, + { + "epoch": 0.3259471050750536, + "grad_norm": 0.6566154360771179, + "learning_rate": 1.486814531655139e-05, + "loss": 0.046, + "step": 11400 + }, + { + "epoch": 0.3262330235882773, + "grad_norm": 0.6716777086257935, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.045, + "step": 11410 + }, + { + "epoch": 0.32651894210150106, + "grad_norm": 0.7489042282104492, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0443, + "step": 11420 + }, + { + "epoch": 0.3268048606147248, + "grad_norm": 0.6040313243865967, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0418, + "step": 11430 + }, + { + "epoch": 0.32709077912794854, + "grad_norm": 0.4891999363899231, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0342, + "step": 11440 + }, + { + "epoch": 0.3273766976411723, + "grad_norm": 0.4264339506626129, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0414, + "step": 11450 + }, + { + "epoch": 0.327662616154396, + "grad_norm": 0.5535606741905212, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0362, + "step": 11460 + }, + { + "epoch": 0.32794853466761975, + "grad_norm": 0.566705048084259, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0472, + "step": 11470 + }, + { + "epoch": 0.32823445318084343, + "grad_norm": 0.8539089560508728, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0478, + "step": 11480 + }, + { + "epoch": 0.32852037169406717, + "grad_norm": 0.3981179893016815, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0429, + "step": 11490 + }, + { + "epoch": 0.3288062902072909, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0487, + "step": 11500 + }, + { + "epoch": 0.32909220872051465, + "grad_norm": 0.45551198720932007, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0384, + "step": 11510 + }, + { + "epoch": 0.3293781272337384, + "grad_norm": 0.6321517825126648, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0541, + "step": 11520 + }, + { + "epoch": 0.3296640457469621, + "grad_norm": 0.7971932888031006, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0445, + "step": 11530 + }, + { + "epoch": 0.32994996426018586, + "grad_norm": 0.5022657513618469, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0414, + "step": 11540 + }, + { + "epoch": 0.3302358827734096, + "grad_norm": 0.7302954196929932, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.044, + "step": 11550 + }, + { + "epoch": 0.33052180128663333, + "grad_norm": 0.5123834013938904, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0451, + "step": 11560 + }, + { + "epoch": 0.330807719799857, + "grad_norm": 0.5261625647544861, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0416, + "step": 11570 + }, + { + "epoch": 0.33109363831308075, + "grad_norm": 0.5782840251922607, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0419, + "step": 11580 + }, + { + "epoch": 0.3313795568263045, + "grad_norm": 0.9754800796508789, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0403, + "step": 11590 + }, + { + "epoch": 0.3316654753395282, + "grad_norm": 0.48157551884651184, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0459, + "step": 11600 + }, + { + "epoch": 0.33195139385275196, + "grad_norm": 0.4394964277744293, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0461, + "step": 11610 + }, + { + "epoch": 0.3322373123659757, + "grad_norm": 1.220790147781372, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0448, + "step": 11620 + }, + { + "epoch": 0.33252323087919944, + "grad_norm": 0.6908231973648071, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0431, + "step": 11630 + }, + { + "epoch": 0.3328091493924232, + "grad_norm": 0.45382779836654663, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0379, + "step": 11640 + }, + { + "epoch": 0.3330950679056469, + "grad_norm": 0.5963619947433472, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0465, + "step": 11650 + }, + { + "epoch": 0.3333809864188706, + "grad_norm": 0.676210880279541, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0411, + "step": 11660 + }, + { + "epoch": 0.33366690493209433, + "grad_norm": 0.893473744392395, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0443, + "step": 11670 + }, + { + "epoch": 0.33395282344531807, + "grad_norm": 0.30655553936958313, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.04, + "step": 11680 + }, + { + "epoch": 0.3342387419585418, + "grad_norm": 0.899615466594696, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0462, + "step": 11690 + }, + { + "epoch": 0.33452466047176554, + "grad_norm": 0.5037568807601929, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0394, + "step": 11700 + }, + { + "epoch": 0.3348105789849893, + "grad_norm": 0.573716402053833, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0426, + "step": 11710 + }, + { + "epoch": 0.335096497498213, + "grad_norm": 0.4985221326351166, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0422, + "step": 11720 + }, + { + "epoch": 0.33538241601143676, + "grad_norm": 0.8864797353744507, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0504, + "step": 11730 + }, + { + "epoch": 0.3356683345246605, + "grad_norm": 0.49209004640579224, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0409, + "step": 11740 + }, + { + "epoch": 0.3359542530378842, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0468, + "step": 11750 + }, + { + "epoch": 0.3362401715511079, + "grad_norm": 0.7552497386932373, + "learning_rate": 1.454836451908656e-05, + "loss": 0.041, + "step": 11760 + }, + { + "epoch": 0.33652609006433165, + "grad_norm": 0.5737242102622986, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0503, + "step": 11770 + }, + { + "epoch": 0.3368120085775554, + "grad_norm": 0.46150341629981995, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0399, + "step": 11780 + }, + { + "epoch": 0.3370979270907791, + "grad_norm": 0.55389803647995, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0442, + "step": 11790 + }, + { + "epoch": 0.33738384560400286, + "grad_norm": 0.7647727727890015, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0472, + "step": 11800 + }, + { + "epoch": 0.3376697641172266, + "grad_norm": 0.8755397200584412, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0444, + "step": 11810 + }, + { + "epoch": 0.33795568263045034, + "grad_norm": 0.9257917404174805, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0416, + "step": 11820 + }, + { + "epoch": 0.3382416011436741, + "grad_norm": 0.4048840403556824, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0418, + "step": 11830 + }, + { + "epoch": 0.33852751965689776, + "grad_norm": 0.584200382232666, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0436, + "step": 11840 + }, + { + "epoch": 0.3388134381701215, + "grad_norm": 0.7565616369247437, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0407, + "step": 11850 + }, + { + "epoch": 0.33909935668334523, + "grad_norm": 0.8025793433189392, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0424, + "step": 11860 + }, + { + "epoch": 0.33938527519656897, + "grad_norm": 0.3123756945133209, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.044, + "step": 11870 + }, + { + "epoch": 0.3396711937097927, + "grad_norm": 0.8047941327095032, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0471, + "step": 11880 + }, + { + "epoch": 0.33995711222301644, + "grad_norm": 0.8675779104232788, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0443, + "step": 11890 + }, + { + "epoch": 0.3402430307362402, + "grad_norm": 0.47229406237602234, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0416, + "step": 11900 + }, + { + "epoch": 0.3405289492494639, + "grad_norm": 0.3775595426559448, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0512, + "step": 11910 + }, + { + "epoch": 0.34081486776268766, + "grad_norm": 0.6179372668266296, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0395, + "step": 11920 + }, + { + "epoch": 0.34110078627591134, + "grad_norm": 0.47618359327316284, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0407, + "step": 11930 + }, + { + "epoch": 0.3413867047891351, + "grad_norm": 0.5495609641075134, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.041, + "step": 11940 + }, + { + "epoch": 0.3416726233023588, + "grad_norm": 0.7276089191436768, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0445, + "step": 11950 + }, + { + "epoch": 0.34195854181558255, + "grad_norm": 0.9464111328125, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0471, + "step": 11960 + }, + { + "epoch": 0.3422444603288063, + "grad_norm": 0.8340250253677368, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0488, + "step": 11970 + }, + { + "epoch": 0.34253037884203, + "grad_norm": 0.6392719149589539, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0407, + "step": 11980 + }, + { + "epoch": 0.34281629735525376, + "grad_norm": 0.7563493251800537, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0388, + "step": 11990 + }, + { + "epoch": 0.3431022158684775, + "grad_norm": 0.7145271301269531, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.042, + "step": 12000 + }, + { + "epoch": 0.34338813438170124, + "grad_norm": 0.6522033214569092, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0507, + "step": 12010 + }, + { + "epoch": 0.3436740528949249, + "grad_norm": 0.4634755849838257, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0388, + "step": 12020 + }, + { + "epoch": 0.34395997140814866, + "grad_norm": 0.6681762337684631, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0498, + "step": 12030 + }, + { + "epoch": 0.3442458899213724, + "grad_norm": 0.5068351626396179, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0484, + "step": 12040 + }, + { + "epoch": 0.34453180843459613, + "grad_norm": 0.5424943566322327, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0406, + "step": 12050 + }, + { + "epoch": 0.34481772694781987, + "grad_norm": 0.674436628818512, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.04, + "step": 12060 + }, + { + "epoch": 0.3451036454610436, + "grad_norm": 0.8140727281570435, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0417, + "step": 12070 + }, + { + "epoch": 0.34538956397426734, + "grad_norm": 0.6394575238227844, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0413, + "step": 12080 + }, + { + "epoch": 0.3456754824874911, + "grad_norm": 0.5134334564208984, + "learning_rate": 1.425047976058418e-05, + "loss": 0.04, + "step": 12090 + }, + { + "epoch": 0.3459614010007148, + "grad_norm": 0.6670883297920227, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0397, + "step": 12100 + }, + { + "epoch": 0.3462473195139385, + "grad_norm": 0.49804338812828064, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0431, + "step": 12110 + }, + { + "epoch": 0.34653323802716224, + "grad_norm": 0.33912673592567444, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0492, + "step": 12120 + }, + { + "epoch": 0.346819156540386, + "grad_norm": 0.45478618144989014, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0427, + "step": 12130 + }, + { + "epoch": 0.3471050750536097, + "grad_norm": 0.6690845489501953, + "learning_rate": 1.420497389129506e-05, + "loss": 0.044, + "step": 12140 + }, + { + "epoch": 0.34739099356683345, + "grad_norm": 0.9296556115150452, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.042, + "step": 12150 + }, + { + "epoch": 0.3476769120800572, + "grad_norm": 0.4859760105609894, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0386, + "step": 12160 + }, + { + "epoch": 0.3479628305932809, + "grad_norm": 1.0067541599273682, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0495, + "step": 12170 + }, + { + "epoch": 0.34824874910650466, + "grad_norm": 0.7799471616744995, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0614, + "step": 12180 + }, + { + "epoch": 0.3485346676197284, + "grad_norm": 0.48603832721710205, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0422, + "step": 12190 + }, + { + "epoch": 0.3488205861329521, + "grad_norm": 1.2030225992202759, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0535, + "step": 12200 + }, + { + "epoch": 0.3491065046461758, + "grad_norm": 0.5523782968521118, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0437, + "step": 12210 + }, + { + "epoch": 0.34939242315939956, + "grad_norm": 0.9041968584060669, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0441, + "step": 12220 + }, + { + "epoch": 0.3496783416726233, + "grad_norm": 0.5859020948410034, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0451, + "step": 12230 + }, + { + "epoch": 0.34996426018584703, + "grad_norm": 0.8736525177955627, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0439, + "step": 12240 + }, + { + "epoch": 0.35025017869907077, + "grad_norm": 0.4692678153514862, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0516, + "step": 12250 + }, + { + "epoch": 0.3505360972122945, + "grad_norm": 0.6326560974121094, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0427, + "step": 12260 + }, + { + "epoch": 0.35082201572551824, + "grad_norm": 0.6265914440155029, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0392, + "step": 12270 + }, + { + "epoch": 0.351107934238742, + "grad_norm": 0.8684681057929993, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0416, + "step": 12280 + }, + { + "epoch": 0.35139385275196566, + "grad_norm": 0.6076116561889648, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0405, + "step": 12290 + }, + { + "epoch": 0.3516797712651894, + "grad_norm": 0.36192813515663147, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0417, + "step": 12300 + }, + { + "epoch": 0.35196568977841314, + "grad_norm": 0.5561486482620239, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0397, + "step": 12310 + }, + { + "epoch": 0.3522516082916369, + "grad_norm": 0.5955346822738647, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0332, + "step": 12320 + }, + { + "epoch": 0.3525375268048606, + "grad_norm": 0.4861294627189636, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0423, + "step": 12330 + }, + { + "epoch": 0.35282344531808435, + "grad_norm": 0.920704185962677, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0467, + "step": 12340 + }, + { + "epoch": 0.3531093638313081, + "grad_norm": 0.4749159514904022, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0425, + "step": 12350 + }, + { + "epoch": 0.3533952823445318, + "grad_norm": 0.5075432658195496, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0362, + "step": 12360 + }, + { + "epoch": 0.35368120085775556, + "grad_norm": 0.3057022988796234, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0378, + "step": 12370 + }, + { + "epoch": 0.35396711937097924, + "grad_norm": 0.48122167587280273, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0359, + "step": 12380 + }, + { + "epoch": 0.354253037884203, + "grad_norm": 0.39227673411369324, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0432, + "step": 12390 + }, + { + "epoch": 0.3545389563974267, + "grad_norm": 0.641839861869812, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0422, + "step": 12400 + }, + { + "epoch": 0.35482487491065046, + "grad_norm": 1.0422887802124023, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0445, + "step": 12410 + }, + { + "epoch": 0.3551107934238742, + "grad_norm": 0.5336428880691528, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0408, + "step": 12420 + }, + { + "epoch": 0.35539671193709793, + "grad_norm": 0.6634368896484375, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0374, + "step": 12430 + }, + { + "epoch": 0.35568263045032167, + "grad_norm": 0.5840758085250854, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0417, + "step": 12440 + }, + { + "epoch": 0.3559685489635454, + "grad_norm": 0.8465530872344971, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0449, + "step": 12450 + }, + { + "epoch": 0.35625446747676914, + "grad_norm": 0.48737838864326477, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0439, + "step": 12460 + }, + { + "epoch": 0.3565403859899928, + "grad_norm": 1.2267687320709229, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0395, + "step": 12470 + }, + { + "epoch": 0.35682630450321656, + "grad_norm": 0.4097842276096344, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0379, + "step": 12480 + }, + { + "epoch": 0.3571122230164403, + "grad_norm": 0.8895343542098999, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0415, + "step": 12490 + }, + { + "epoch": 0.35739814152966404, + "grad_norm": 0.6732933521270752, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0432, + "step": 12500 + }, + { + "epoch": 0.3576840600428878, + "grad_norm": 0.4521937966346741, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0442, + "step": 12510 + }, + { + "epoch": 0.3579699785561115, + "grad_norm": 0.5932701826095581, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0407, + "step": 12520 + }, + { + "epoch": 0.35825589706933525, + "grad_norm": 0.5595138669013977, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0387, + "step": 12530 + }, + { + "epoch": 0.358541815582559, + "grad_norm": 0.7205538153648376, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0393, + "step": 12540 + }, + { + "epoch": 0.3588277340957827, + "grad_norm": 0.4069580137729645, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0554, + "step": 12550 + }, + { + "epoch": 0.3591136526090064, + "grad_norm": 0.4881740212440491, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0411, + "step": 12560 + }, + { + "epoch": 0.35939957112223014, + "grad_norm": 0.7710328102111816, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.043, + "step": 12570 + }, + { + "epoch": 0.3596854896354539, + "grad_norm": 0.6593908071517944, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.046, + "step": 12580 + }, + { + "epoch": 0.3599714081486776, + "grad_norm": 0.6712149977684021, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0392, + "step": 12590 + }, + { + "epoch": 0.36025732666190136, + "grad_norm": 0.6103658080101013, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0482, + "step": 12600 + }, + { + "epoch": 0.3605432451751251, + "grad_norm": 0.5170528292655945, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0441, + "step": 12610 + }, + { + "epoch": 0.36082916368834883, + "grad_norm": 0.47434374690055847, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0436, + "step": 12620 + }, + { + "epoch": 0.36111508220157257, + "grad_norm": 0.6546452045440674, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0441, + "step": 12630 + }, + { + "epoch": 0.3614010007147963, + "grad_norm": 1.3334686756134033, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0464, + "step": 12640 + }, + { + "epoch": 0.36168691922802, + "grad_norm": 1.3882309198379517, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0527, + "step": 12650 + }, + { + "epoch": 0.3619728377412437, + "grad_norm": 0.829872190952301, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0499, + "step": 12660 + }, + { + "epoch": 0.36225875625446746, + "grad_norm": 0.6917227506637573, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0513, + "step": 12670 + }, + { + "epoch": 0.3625446747676912, + "grad_norm": 0.3825722634792328, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0455, + "step": 12680 + }, + { + "epoch": 0.36283059328091494, + "grad_norm": 0.7726976275444031, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0366, + "step": 12690 + }, + { + "epoch": 0.3631165117941387, + "grad_norm": 0.48851099610328674, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0363, + "step": 12700 + }, + { + "epoch": 0.3634024303073624, + "grad_norm": 0.5034362077713013, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0461, + "step": 12710 + }, + { + "epoch": 0.36368834882058615, + "grad_norm": 0.8411096334457397, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0448, + "step": 12720 + }, + { + "epoch": 0.3639742673338099, + "grad_norm": 0.7185337543487549, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0366, + "step": 12730 + }, + { + "epoch": 0.36426018584703357, + "grad_norm": 0.5850857496261597, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0414, + "step": 12740 + }, + { + "epoch": 0.3645461043602573, + "grad_norm": 0.47304606437683105, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0464, + "step": 12750 + }, + { + "epoch": 0.36483202287348104, + "grad_norm": 0.7190109491348267, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0418, + "step": 12760 + }, + { + "epoch": 0.3651179413867048, + "grad_norm": 0.8053406476974487, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0407, + "step": 12770 + }, + { + "epoch": 0.3654038598999285, + "grad_norm": 0.8875076174736023, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0471, + "step": 12780 + }, + { + "epoch": 0.36568977841315226, + "grad_norm": 0.5206999182701111, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0478, + "step": 12790 + }, + { + "epoch": 0.365975696926376, + "grad_norm": 0.5034269690513611, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0422, + "step": 12800 + }, + { + "epoch": 0.36626161543959973, + "grad_norm": 0.9846853017807007, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.045, + "step": 12810 + }, + { + "epoch": 0.36654753395282347, + "grad_norm": 0.49341151118278503, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0471, + "step": 12820 + }, + { + "epoch": 0.36683345246604715, + "grad_norm": 0.765583336353302, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0411, + "step": 12830 + }, + { + "epoch": 0.3671193709792709, + "grad_norm": 0.5193378925323486, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0522, + "step": 12840 + }, + { + "epoch": 0.3674052894924946, + "grad_norm": 0.8142374157905579, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0374, + "step": 12850 + }, + { + "epoch": 0.36769120800571836, + "grad_norm": 0.7233540415763855, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0516, + "step": 12860 + }, + { + "epoch": 0.3679771265189421, + "grad_norm": 0.38758793473243713, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0437, + "step": 12870 + }, + { + "epoch": 0.36826304503216584, + "grad_norm": 0.36923956871032715, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.041, + "step": 12880 + }, + { + "epoch": 0.3685489635453896, + "grad_norm": 1.0518147945404053, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0446, + "step": 12890 + }, + { + "epoch": 0.3688348820586133, + "grad_norm": 0.5833591818809509, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0362, + "step": 12900 + }, + { + "epoch": 0.36912080057183705, + "grad_norm": 0.6178849339485168, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.041, + "step": 12910 + }, + { + "epoch": 0.36940671908506073, + "grad_norm": 0.7599044442176819, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0473, + "step": 12920 + }, + { + "epoch": 0.36969263759828447, + "grad_norm": 0.7787651419639587, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0461, + "step": 12930 + }, + { + "epoch": 0.3699785561115082, + "grad_norm": 0.3847586512565613, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0413, + "step": 12940 + }, + { + "epoch": 0.37026447462473194, + "grad_norm": 0.6218805313110352, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0424, + "step": 12950 + }, + { + "epoch": 0.3705503931379557, + "grad_norm": 0.6770363450050354, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0426, + "step": 12960 + }, + { + "epoch": 0.3708363116511794, + "grad_norm": 0.6817107796669006, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.041, + "step": 12970 + }, + { + "epoch": 0.37112223016440316, + "grad_norm": 1.6997944116592407, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0626, + "step": 12980 + }, + { + "epoch": 0.3714081486776269, + "grad_norm": 0.4540708363056183, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0356, + "step": 12990 + }, + { + "epoch": 0.37169406719085063, + "grad_norm": 0.4272336959838867, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0354, + "step": 13000 + }, + { + "epoch": 0.3719799857040743, + "grad_norm": 0.4723891019821167, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0438, + "step": 13010 + }, + { + "epoch": 0.37226590421729805, + "grad_norm": 0.5508099794387817, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.042, + "step": 13020 + }, + { + "epoch": 0.3725518227305218, + "grad_norm": 1.05836021900177, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0472, + "step": 13030 + }, + { + "epoch": 0.3728377412437455, + "grad_norm": 0.4397801458835602, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0462, + "step": 13040 + }, + { + "epoch": 0.37312365975696926, + "grad_norm": 0.3131158649921417, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0383, + "step": 13050 + }, + { + "epoch": 0.373409578270193, + "grad_norm": 0.5489990711212158, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0398, + "step": 13060 + }, + { + "epoch": 0.37369549678341674, + "grad_norm": 0.7425751686096191, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0416, + "step": 13070 + }, + { + "epoch": 0.3739814152966405, + "grad_norm": 0.6337125301361084, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0387, + "step": 13080 + }, + { + "epoch": 0.3742673338098642, + "grad_norm": 0.656467854976654, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0431, + "step": 13090 + }, + { + "epoch": 0.3745532523230879, + "grad_norm": 0.7011964321136475, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0487, + "step": 13100 + }, + { + "epoch": 0.37483917083631163, + "grad_norm": 0.4949609041213989, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0429, + "step": 13110 + }, + { + "epoch": 0.37512508934953537, + "grad_norm": 0.6796516180038452, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0405, + "step": 13120 + }, + { + "epoch": 0.3754110078627591, + "grad_norm": 0.41161492466926575, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0359, + "step": 13130 + }, + { + "epoch": 0.37569692637598284, + "grad_norm": 0.4463254511356354, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0353, + "step": 13140 + }, + { + "epoch": 0.3759828448892066, + "grad_norm": 0.4082377254962921, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.047, + "step": 13150 + }, + { + "epoch": 0.3762687634024303, + "grad_norm": 0.7927104830741882, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0484, + "step": 13160 + }, + { + "epoch": 0.37655468191565405, + "grad_norm": 0.5212385058403015, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.041, + "step": 13170 + }, + { + "epoch": 0.3768406004288778, + "grad_norm": 0.7408128380775452, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0462, + "step": 13180 + }, + { + "epoch": 0.3771265189421015, + "grad_norm": 0.3847906291484833, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0361, + "step": 13190 + }, + { + "epoch": 0.3774124374553252, + "grad_norm": 0.5039756298065186, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0385, + "step": 13200 + }, + { + "epoch": 0.37769835596854895, + "grad_norm": 0.5682945251464844, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0369, + "step": 13210 + }, + { + "epoch": 0.3779842744817727, + "grad_norm": 0.5985261797904968, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0376, + "step": 13220 + }, + { + "epoch": 0.3782701929949964, + "grad_norm": 0.7080312967300415, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0514, + "step": 13230 + }, + { + "epoch": 0.37855611150822016, + "grad_norm": 0.7488406300544739, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0421, + "step": 13240 + }, + { + "epoch": 0.3788420300214439, + "grad_norm": 0.38066044449806213, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0411, + "step": 13250 + }, + { + "epoch": 0.37912794853466764, + "grad_norm": 0.6335283517837524, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0526, + "step": 13260 + }, + { + "epoch": 0.3794138670478914, + "grad_norm": 0.7008160352706909, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0402, + "step": 13270 + }, + { + "epoch": 0.37969978556111506, + "grad_norm": 0.4219777286052704, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.039, + "step": 13280 + }, + { + "epoch": 0.3799857040743388, + "grad_norm": 0.6447705030441284, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0412, + "step": 13290 + }, + { + "epoch": 0.38027162258756253, + "grad_norm": 0.4625374674797058, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0437, + "step": 13300 + }, + { + "epoch": 0.38055754110078627, + "grad_norm": 0.4056257903575897, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0377, + "step": 13310 + }, + { + "epoch": 0.38084345961401, + "grad_norm": 0.425281286239624, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0378, + "step": 13320 + }, + { + "epoch": 0.38112937812723374, + "grad_norm": 0.4031837582588196, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0361, + "step": 13330 + }, + { + "epoch": 0.3814152966404575, + "grad_norm": 0.469175785779953, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0391, + "step": 13340 + }, + { + "epoch": 0.3817012151536812, + "grad_norm": 0.36555227637290955, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0352, + "step": 13350 + }, + { + "epoch": 0.38198713366690495, + "grad_norm": 0.8802763819694519, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0412, + "step": 13360 + }, + { + "epoch": 0.38227305218012864, + "grad_norm": 0.5733079314231873, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0418, + "step": 13370 + }, + { + "epoch": 0.3825589706933524, + "grad_norm": 0.606238842010498, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0518, + "step": 13380 + }, + { + "epoch": 0.3828448892065761, + "grad_norm": 0.5096673369407654, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0404, + "step": 13390 + }, + { + "epoch": 0.38313080771979985, + "grad_norm": 0.8240867853164673, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0513, + "step": 13400 + }, + { + "epoch": 0.3834167262330236, + "grad_norm": 0.3757685422897339, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0407, + "step": 13410 + }, + { + "epoch": 0.3837026447462473, + "grad_norm": 0.4560941755771637, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0429, + "step": 13420 + }, + { + "epoch": 0.38398856325947106, + "grad_norm": 0.42831951379776, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0387, + "step": 13430 + }, + { + "epoch": 0.3842744817726948, + "grad_norm": 0.8373785614967346, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0473, + "step": 13440 + }, + { + "epoch": 0.38456040028591854, + "grad_norm": 0.9560670256614685, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0442, + "step": 13450 + }, + { + "epoch": 0.3848463187991422, + "grad_norm": 0.4101570248603821, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0429, + "step": 13460 + }, + { + "epoch": 0.38513223731236595, + "grad_norm": 0.673739492893219, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0525, + "step": 13470 + }, + { + "epoch": 0.3854181558255897, + "grad_norm": 1.126909852027893, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0499, + "step": 13480 + }, + { + "epoch": 0.38570407433881343, + "grad_norm": 0.571437656879425, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0431, + "step": 13490 + }, + { + "epoch": 0.38598999285203717, + "grad_norm": 0.5121229887008667, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0419, + "step": 13500 + }, + { + "epoch": 0.3862759113652609, + "grad_norm": 0.6143786907196045, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0373, + "step": 13510 + }, + { + "epoch": 0.38656182987848464, + "grad_norm": 0.395014226436615, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0457, + "step": 13520 + }, + { + "epoch": 0.3868477483917084, + "grad_norm": 0.46027693152427673, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0372, + "step": 13530 + }, + { + "epoch": 0.3871336669049321, + "grad_norm": 0.42744559049606323, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0417, + "step": 13540 + }, + { + "epoch": 0.3874195854181558, + "grad_norm": 0.4765837490558624, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0442, + "step": 13550 + }, + { + "epoch": 0.38770550393137954, + "grad_norm": 0.9767054319381714, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0397, + "step": 13560 + }, + { + "epoch": 0.3879914224446033, + "grad_norm": 0.5535935759544373, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0423, + "step": 13570 + }, + { + "epoch": 0.388277340957827, + "grad_norm": 0.3802829384803772, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0329, + "step": 13580 + }, + { + "epoch": 0.38856325947105075, + "grad_norm": 0.6564178466796875, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0423, + "step": 13590 + }, + { + "epoch": 0.3888491779842745, + "grad_norm": 0.4400223195552826, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0356, + "step": 13600 + }, + { + "epoch": 0.3891350964974982, + "grad_norm": 0.4441612958908081, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0576, + "step": 13610 + }, + { + "epoch": 0.38942101501072196, + "grad_norm": 0.5270922780036926, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0406, + "step": 13620 + }, + { + "epoch": 0.3897069335239457, + "grad_norm": 0.6497722268104553, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0389, + "step": 13630 + }, + { + "epoch": 0.3899928520371694, + "grad_norm": 0.628182053565979, + "learning_rate": 1.280216624157504e-05, + "loss": 0.049, + "step": 13640 + }, + { + "epoch": 0.3902787705503931, + "grad_norm": 0.5242640376091003, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0389, + "step": 13650 + }, + { + "epoch": 0.39056468906361685, + "grad_norm": 0.5140895843505859, + "learning_rate": 1.278305741539386e-05, + "loss": 0.047, + "step": 13660 + }, + { + "epoch": 0.3908506075768406, + "grad_norm": 0.531012773513794, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0415, + "step": 13670 + }, + { + "epoch": 0.39113652609006433, + "grad_norm": 0.5066007375717163, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0411, + "step": 13680 + }, + { + "epoch": 0.39142244460328807, + "grad_norm": 1.0783177614212036, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0371, + "step": 13690 + }, + { + "epoch": 0.3917083631165118, + "grad_norm": 0.592755913734436, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0402, + "step": 13700 + }, + { + "epoch": 0.39199428162973554, + "grad_norm": 0.5595790147781372, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0543, + "step": 13710 + }, + { + "epoch": 0.3922802001429593, + "grad_norm": 0.5388237237930298, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0487, + "step": 13720 + }, + { + "epoch": 0.39256611865618296, + "grad_norm": 0.5311065316200256, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0389, + "step": 13730 + }, + { + "epoch": 0.3928520371694067, + "grad_norm": 0.8037494421005249, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0456, + "step": 13740 + }, + { + "epoch": 0.39313795568263044, + "grad_norm": 0.851921796798706, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0389, + "step": 13750 + }, + { + "epoch": 0.3934238741958542, + "grad_norm": 0.5924596190452576, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0401, + "step": 13760 + }, + { + "epoch": 0.3937097927090779, + "grad_norm": 0.5660725831985474, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0443, + "step": 13770 + }, + { + "epoch": 0.39399571122230165, + "grad_norm": 0.4110502004623413, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0438, + "step": 13780 + }, + { + "epoch": 0.3942816297355254, + "grad_norm": 0.7104408144950867, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.042, + "step": 13790 + }, + { + "epoch": 0.3945675482487491, + "grad_norm": 0.5490137338638306, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0477, + "step": 13800 + }, + { + "epoch": 0.39485346676197286, + "grad_norm": 0.4189203083515167, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0446, + "step": 13810 + }, + { + "epoch": 0.39513938527519654, + "grad_norm": 3.620929479598999, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0541, + "step": 13820 + }, + { + "epoch": 0.3954253037884203, + "grad_norm": 0.4670915901660919, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0391, + "step": 13830 + }, + { + "epoch": 0.395711222301644, + "grad_norm": 0.4475649297237396, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.04, + "step": 13840 + }, + { + "epoch": 0.39599714081486775, + "grad_norm": 0.4646693170070648, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0412, + "step": 13850 + }, + { + "epoch": 0.3962830593280915, + "grad_norm": 0.4141371250152588, + "learning_rate": 1.259152361972498e-05, + "loss": 0.039, + "step": 13860 + }, + { + "epoch": 0.39656897784131523, + "grad_norm": 0.7549411058425903, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0414, + "step": 13870 + }, + { + "epoch": 0.39685489635453897, + "grad_norm": 0.5687856078147888, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0441, + "step": 13880 + }, + { + "epoch": 0.3971408148677627, + "grad_norm": 0.582946240901947, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0451, + "step": 13890 + }, + { + "epoch": 0.39742673338098644, + "grad_norm": 0.6410595178604126, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0362, + "step": 13900 + }, + { + "epoch": 0.3977126518942101, + "grad_norm": 0.4375670850276947, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0552, + "step": 13910 + }, + { + "epoch": 0.39799857040743386, + "grad_norm": 0.5675646662712097, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0373, + "step": 13920 + }, + { + "epoch": 0.3982844889206576, + "grad_norm": 0.544170618057251, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0449, + "step": 13930 + }, + { + "epoch": 0.39857040743388134, + "grad_norm": 0.44928276538848877, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0461, + "step": 13940 + }, + { + "epoch": 0.3988563259471051, + "grad_norm": 0.511382520198822, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0413, + "step": 13950 + }, + { + "epoch": 0.3991422444603288, + "grad_norm": 0.38443753123283386, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0374, + "step": 13960 + }, + { + "epoch": 0.39942816297355255, + "grad_norm": 0.5726080536842346, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0553, + "step": 13970 + }, + { + "epoch": 0.3997140814867763, + "grad_norm": 0.554694414138794, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0404, + "step": 13980 + }, + { + "epoch": 0.4, + "grad_norm": 0.4891316592693329, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0418, + "step": 13990 + }, + { + "epoch": 0.4002859185132237, + "grad_norm": 0.5150312781333923, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0418, + "step": 14000 + }, + { + "epoch": 0.40057183702644744, + "grad_norm": 0.9077253937721252, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0415, + "step": 14010 + }, + { + "epoch": 0.4008577555396712, + "grad_norm": 0.9126781225204468, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.048, + "step": 14020 + }, + { + "epoch": 0.4011436740528949, + "grad_norm": 0.6264623999595642, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0411, + "step": 14030 + }, + { + "epoch": 0.40142959256611865, + "grad_norm": 0.523853600025177, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.051, + "step": 14040 + }, + { + "epoch": 0.4017155110793424, + "grad_norm": 0.6340035200119019, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0426, + "step": 14050 + }, + { + "epoch": 0.40200142959256613, + "grad_norm": 0.3594725430011749, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0397, + "step": 14060 + }, + { + "epoch": 0.40228734810578987, + "grad_norm": 0.941470742225647, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0402, + "step": 14070 + }, + { + "epoch": 0.4025732666190136, + "grad_norm": 0.840506911277771, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0473, + "step": 14080 + }, + { + "epoch": 0.4028591851322373, + "grad_norm": 0.3359200954437256, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0405, + "step": 14090 + }, + { + "epoch": 0.403145103645461, + "grad_norm": 0.49658629298210144, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0464, + "step": 14100 + }, + { + "epoch": 0.40343102215868476, + "grad_norm": 0.7940187454223633, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0417, + "step": 14110 + }, + { + "epoch": 0.4037169406719085, + "grad_norm": 0.30110660195350647, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0371, + "step": 14120 + }, + { + "epoch": 0.40400285918513223, + "grad_norm": 0.42845240235328674, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.053, + "step": 14130 + }, + { + "epoch": 0.40428877769835597, + "grad_norm": 0.997348427772522, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.041, + "step": 14140 + }, + { + "epoch": 0.4045746962115797, + "grad_norm": 0.4759966731071472, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0377, + "step": 14150 + }, + { + "epoch": 0.40486061472480345, + "grad_norm": 0.42045602202415466, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0397, + "step": 14160 + }, + { + "epoch": 0.4051465332380272, + "grad_norm": 0.6400002837181091, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0507, + "step": 14170 + }, + { + "epoch": 0.40543245175125087, + "grad_norm": 0.5473673939704895, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0359, + "step": 14180 + }, + { + "epoch": 0.4057183702644746, + "grad_norm": 0.7414730787277222, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0416, + "step": 14190 + }, + { + "epoch": 0.40600428877769834, + "grad_norm": 0.4691861867904663, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0363, + "step": 14200 + }, + { + "epoch": 0.4062902072909221, + "grad_norm": 0.9186112880706787, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0445, + "step": 14210 + }, + { + "epoch": 0.4065761258041458, + "grad_norm": 0.6782190203666687, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.40686204431736955, + "grad_norm": 0.6948013305664062, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.037, + "step": 14230 + }, + { + "epoch": 0.4071479628305933, + "grad_norm": 0.3034680485725403, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0371, + "step": 14240 + }, + { + "epoch": 0.40743388134381703, + "grad_norm": 0.4254174828529358, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0449, + "step": 14250 + }, + { + "epoch": 0.40771979985704077, + "grad_norm": 1.3622064590454102, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0428, + "step": 14260 + }, + { + "epoch": 0.40800571837026445, + "grad_norm": 0.5928359031677246, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0424, + "step": 14270 + }, + { + "epoch": 0.4082916368834882, + "grad_norm": 0.9103132486343384, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0414, + "step": 14280 + }, + { + "epoch": 0.4085775553967119, + "grad_norm": 0.6338028311729431, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0376, + "step": 14290 + }, + { + "epoch": 0.40886347390993566, + "grad_norm": 0.9920284748077393, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0393, + "step": 14300 + }, + { + "epoch": 0.4091493924231594, + "grad_norm": 0.411830335855484, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0336, + "step": 14310 + }, + { + "epoch": 0.40943531093638313, + "grad_norm": 0.6977682709693909, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0454, + "step": 14320 + }, + { + "epoch": 0.40972122944960687, + "grad_norm": 0.6303663849830627, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0453, + "step": 14330 + }, + { + "epoch": 0.4100071479628306, + "grad_norm": 0.3048207759857178, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0373, + "step": 14340 + }, + { + "epoch": 0.41029306647605435, + "grad_norm": 0.7683395743370056, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0438, + "step": 14350 + }, + { + "epoch": 0.41057898498927803, + "grad_norm": 0.5791511535644531, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0392, + "step": 14360 + }, + { + "epoch": 0.41086490350250177, + "grad_norm": 0.876626193523407, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0324, + "step": 14370 + }, + { + "epoch": 0.4111508220157255, + "grad_norm": 0.5971815586090088, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0368, + "step": 14380 + }, + { + "epoch": 0.41143674052894924, + "grad_norm": 0.6508862376213074, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0411, + "step": 14390 + }, + { + "epoch": 0.411722659042173, + "grad_norm": 0.4704359471797943, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0351, + "step": 14400 + }, + { + "epoch": 0.4120085775553967, + "grad_norm": 0.4266453683376312, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0367, + "step": 14410 + }, + { + "epoch": 0.41229449606862045, + "grad_norm": 0.5898434519767761, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0376, + "step": 14420 + }, + { + "epoch": 0.4125804145818442, + "grad_norm": 0.8741532564163208, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0419, + "step": 14430 + }, + { + "epoch": 0.41286633309506793, + "grad_norm": 0.24328190088272095, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0333, + "step": 14440 + }, + { + "epoch": 0.4131522516082916, + "grad_norm": 0.4263601303100586, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.039, + "step": 14450 + }, + { + "epoch": 0.41343817012151535, + "grad_norm": 0.6311615109443665, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0454, + "step": 14460 + }, + { + "epoch": 0.4137240886347391, + "grad_norm": 0.7424519658088684, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0392, + "step": 14470 + }, + { + "epoch": 0.4140100071479628, + "grad_norm": 0.48323145508766174, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0374, + "step": 14480 + }, + { + "epoch": 0.41429592566118656, + "grad_norm": 0.38597407937049866, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0393, + "step": 14490 + }, + { + "epoch": 0.4145818441744103, + "grad_norm": 0.7251518964767456, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0431, + "step": 14500 + }, + { + "epoch": 0.41486776268763403, + "grad_norm": 0.44361060857772827, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0426, + "step": 14510 + }, + { + "epoch": 0.41515368120085777, + "grad_norm": 0.5625014305114746, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0372, + "step": 14520 + }, + { + "epoch": 0.4154395997140815, + "grad_norm": 0.27855798602104187, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0356, + "step": 14530 + }, + { + "epoch": 0.4157255182273052, + "grad_norm": 0.5966296195983887, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0387, + "step": 14540 + }, + { + "epoch": 0.41601143674052893, + "grad_norm": 0.49445512890815735, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0355, + "step": 14550 + }, + { + "epoch": 0.41629735525375267, + "grad_norm": 0.3813278377056122, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0456, + "step": 14560 + }, + { + "epoch": 0.4165832737669764, + "grad_norm": 0.5962988138198853, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0401, + "step": 14570 + }, + { + "epoch": 0.41686919228020014, + "grad_norm": 0.4028547406196594, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0371, + "step": 14580 + }, + { + "epoch": 0.4171551107934239, + "grad_norm": 1.348706841468811, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0426, + "step": 14590 + }, + { + "epoch": 0.4174410293066476, + "grad_norm": 1.2782070636749268, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0393, + "step": 14600 + }, + { + "epoch": 0.41772694781987135, + "grad_norm": 1.0024999380111694, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0436, + "step": 14610 + }, + { + "epoch": 0.4180128663330951, + "grad_norm": 0.35450127720832825, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0411, + "step": 14620 + }, + { + "epoch": 0.41829878484631877, + "grad_norm": 0.5827250480651855, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0372, + "step": 14630 + }, + { + "epoch": 0.4185847033595425, + "grad_norm": 0.5905774235725403, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0394, + "step": 14640 + }, + { + "epoch": 0.41887062187276625, + "grad_norm": 0.652074933052063, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0405, + "step": 14650 + }, + { + "epoch": 0.41915654038599, + "grad_norm": 0.7245490550994873, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0473, + "step": 14660 + }, + { + "epoch": 0.4194424588992137, + "grad_norm": 0.5153012871742249, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.043, + "step": 14670 + }, + { + "epoch": 0.41972837741243746, + "grad_norm": 0.516107976436615, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0434, + "step": 14680 + }, + { + "epoch": 0.4200142959256612, + "grad_norm": 0.4743354618549347, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0429, + "step": 14690 + }, + { + "epoch": 0.42030021443888493, + "grad_norm": 0.547875165939331, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0395, + "step": 14700 + }, + { + "epoch": 0.42058613295210867, + "grad_norm": 0.6398400068283081, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0384, + "step": 14710 + }, + { + "epoch": 0.42087205146533235, + "grad_norm": 0.5891467332839966, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0399, + "step": 14720 + }, + { + "epoch": 0.4211579699785561, + "grad_norm": 0.3927595615386963, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0353, + "step": 14730 + }, + { + "epoch": 0.42144388849177983, + "grad_norm": 0.6477030515670776, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0492, + "step": 14740 + }, + { + "epoch": 0.42172980700500357, + "grad_norm": 0.7090615034103394, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.042, + "step": 14750 + }, + { + "epoch": 0.4220157255182273, + "grad_norm": 0.6572134494781494, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0406, + "step": 14760 + }, + { + "epoch": 0.42230164403145104, + "grad_norm": 0.787663996219635, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0424, + "step": 14770 + }, + { + "epoch": 0.4225875625446748, + "grad_norm": 0.8419309258460999, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0427, + "step": 14780 + }, + { + "epoch": 0.4228734810578985, + "grad_norm": 0.6204128861427307, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0364, + "step": 14790 + }, + { + "epoch": 0.42315939957112225, + "grad_norm": 0.7446070313453674, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0391, + "step": 14800 + }, + { + "epoch": 0.42344531808434593, + "grad_norm": 0.7446451783180237, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0384, + "step": 14810 + }, + { + "epoch": 0.42373123659756967, + "grad_norm": 0.6946475505828857, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0375, + "step": 14820 + }, + { + "epoch": 0.4240171551107934, + "grad_norm": 0.6997008323669434, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0393, + "step": 14830 + }, + { + "epoch": 0.42430307362401715, + "grad_norm": 0.4857316315174103, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0474, + "step": 14840 + }, + { + "epoch": 0.4245889921372409, + "grad_norm": 1.3516888618469238, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.047, + "step": 14850 + }, + { + "epoch": 0.4248749106504646, + "grad_norm": 0.40320220589637756, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0418, + "step": 14860 + }, + { + "epoch": 0.42516082916368836, + "grad_norm": 0.9002796411514282, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0434, + "step": 14870 + }, + { + "epoch": 0.4254467476769121, + "grad_norm": 0.3810071349143982, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0338, + "step": 14880 + }, + { + "epoch": 0.42573266619013583, + "grad_norm": 0.5786157250404358, + "learning_rate": 1.159527607963768e-05, + "loss": 0.037, + "step": 14890 + }, + { + "epoch": 0.4260185847033595, + "grad_norm": 0.6316869258880615, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0388, + "step": 14900 + }, + { + "epoch": 0.42630450321658325, + "grad_norm": 0.608745276927948, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0426, + "step": 14910 + }, + { + "epoch": 0.426590421729807, + "grad_norm": 0.6655036807060242, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0433, + "step": 14920 + }, + { + "epoch": 0.4268763402430307, + "grad_norm": 0.29059523344039917, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0507, + "step": 14930 + }, + { + "epoch": 0.42716225875625446, + "grad_norm": 0.9066076278686523, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0447, + "step": 14940 + }, + { + "epoch": 0.4274481772694782, + "grad_norm": 1.0660220384597778, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0512, + "step": 14950 + }, + { + "epoch": 0.42773409578270194, + "grad_norm": 0.6081144213676453, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0426, + "step": 14960 + }, + { + "epoch": 0.4280200142959257, + "grad_norm": 0.46524369716644287, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0435, + "step": 14970 + }, + { + "epoch": 0.4283059328091494, + "grad_norm": 0.3497388958930969, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0492, + "step": 14980 + }, + { + "epoch": 0.4285918513223731, + "grad_norm": 0.41300803422927856, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.034, + "step": 14990 + }, + { + "epoch": 0.42887776983559683, + "grad_norm": 0.4363289177417755, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0358, + "step": 15000 + }, + { + "epoch": 0.42916368834882057, + "grad_norm": 1.314915418624878, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.047, + "step": 15010 + }, + { + "epoch": 0.4294496068620443, + "grad_norm": 0.558199942111969, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0313, + "step": 15020 + }, + { + "epoch": 0.42973552537526805, + "grad_norm": 0.3857463598251343, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0416, + "step": 15030 + }, + { + "epoch": 0.4300214438884918, + "grad_norm": 0.4701749384403229, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0425, + "step": 15040 + }, + { + "epoch": 0.4303073624017155, + "grad_norm": 0.4611213803291321, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0457, + "step": 15050 + }, + { + "epoch": 0.43059328091493926, + "grad_norm": 0.5338016152381897, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.038, + "step": 15060 + }, + { + "epoch": 0.430879199428163, + "grad_norm": 0.9078943133354187, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0395, + "step": 15070 + }, + { + "epoch": 0.4311651179413867, + "grad_norm": 0.5354048013687134, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0403, + "step": 15080 + }, + { + "epoch": 0.4314510364546104, + "grad_norm": 0.35511279106140137, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0377, + "step": 15090 + }, + { + "epoch": 0.43173695496783415, + "grad_norm": 0.37104350328445435, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0426, + "step": 15100 + }, + { + "epoch": 0.4320228734810579, + "grad_norm": 0.8916210532188416, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0387, + "step": 15110 + }, + { + "epoch": 0.4323087919942816, + "grad_norm": 0.514994740486145, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0384, + "step": 15120 + }, + { + "epoch": 0.43259471050750536, + "grad_norm": 0.8440690040588379, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0437, + "step": 15130 + }, + { + "epoch": 0.4328806290207291, + "grad_norm": 0.6815949082374573, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0453, + "step": 15140 + }, + { + "epoch": 0.43316654753395284, + "grad_norm": 0.33178189396858215, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0351, + "step": 15150 + }, + { + "epoch": 0.4334524660471766, + "grad_norm": 0.5686727166175842, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0368, + "step": 15160 + }, + { + "epoch": 0.43373838456040026, + "grad_norm": 0.44143930077552795, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0443, + "step": 15170 + }, + { + "epoch": 0.434024303073624, + "grad_norm": 0.3238232135772705, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0348, + "step": 15180 + }, + { + "epoch": 0.43431022158684773, + "grad_norm": 0.5038242340087891, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0343, + "step": 15190 + }, + { + "epoch": 0.43459614010007147, + "grad_norm": 0.4904351234436035, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0397, + "step": 15200 + }, + { + "epoch": 0.4348820586132952, + "grad_norm": 0.5325750708580017, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0499, + "step": 15210 + }, + { + "epoch": 0.43516797712651895, + "grad_norm": 0.39443954825401306, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.044, + "step": 15220 + }, + { + "epoch": 0.4354538956397427, + "grad_norm": 0.6782003045082092, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0358, + "step": 15230 + }, + { + "epoch": 0.4357398141529664, + "grad_norm": 0.47862571477890015, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0418, + "step": 15240 + }, + { + "epoch": 0.43602573266619016, + "grad_norm": 1.6515535116195679, + "learning_rate": 1.124468908014616e-05, + "loss": 0.043, + "step": 15250 + }, + { + "epoch": 0.43631165117941384, + "grad_norm": 0.4902660846710205, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0371, + "step": 15260 + }, + { + "epoch": 0.4365975696926376, + "grad_norm": 0.5742762088775635, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0369, + "step": 15270 + }, + { + "epoch": 0.4368834882058613, + "grad_norm": 0.42058590054512024, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0378, + "step": 15280 + }, + { + "epoch": 0.43716940671908505, + "grad_norm": 0.43729284405708313, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0352, + "step": 15290 + }, + { + "epoch": 0.4374553252323088, + "grad_norm": 0.4689466953277588, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0433, + "step": 15300 + }, + { + "epoch": 0.4377412437455325, + "grad_norm": 0.6272432208061218, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0548, + "step": 15310 + }, + { + "epoch": 0.43802716225875626, + "grad_norm": 1.1129611730575562, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0437, + "step": 15320 + }, + { + "epoch": 0.43831308077198, + "grad_norm": 0.9332655072212219, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0503, + "step": 15330 + }, + { + "epoch": 0.43859899928520374, + "grad_norm": 0.35150477290153503, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0351, + "step": 15340 + }, + { + "epoch": 0.4388849177984274, + "grad_norm": 0.3826565444469452, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0361, + "step": 15350 + }, + { + "epoch": 0.43917083631165116, + "grad_norm": 0.817319393157959, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0352, + "step": 15360 + }, + { + "epoch": 0.4394567548248749, + "grad_norm": 0.4379598796367645, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0469, + "step": 15370 + }, + { + "epoch": 0.43974267333809863, + "grad_norm": 0.6475314497947693, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0456, + "step": 15380 + }, + { + "epoch": 0.44002859185132237, + "grad_norm": 0.529088020324707, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0453, + "step": 15390 + }, + { + "epoch": 0.4403145103645461, + "grad_norm": 0.4915194809436798, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0369, + "step": 15400 + }, + { + "epoch": 0.44060042887776985, + "grad_norm": 0.4766380786895752, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0391, + "step": 15410 + }, + { + "epoch": 0.4408863473909936, + "grad_norm": 0.34667786955833435, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0327, + "step": 15420 + }, + { + "epoch": 0.4411722659042173, + "grad_norm": 0.504242479801178, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0413, + "step": 15430 + }, + { + "epoch": 0.441458184417441, + "grad_norm": 0.49786439538002014, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0361, + "step": 15440 + }, + { + "epoch": 0.44174410293066474, + "grad_norm": 0.4997329115867615, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0368, + "step": 15450 + }, + { + "epoch": 0.4420300214438885, + "grad_norm": 0.2992185056209564, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0359, + "step": 15460 + }, + { + "epoch": 0.4423159399571122, + "grad_norm": 0.6645393371582031, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0401, + "step": 15470 + }, + { + "epoch": 0.44260185847033595, + "grad_norm": 0.6327983140945435, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0386, + "step": 15480 + }, + { + "epoch": 0.4428877769835597, + "grad_norm": 0.45607903599739075, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0386, + "step": 15490 + }, + { + "epoch": 0.4431736954967834, + "grad_norm": 0.4401610493659973, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0417, + "step": 15500 + }, + { + "epoch": 0.44345961401000716, + "grad_norm": 0.5778466463088989, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0417, + "step": 15510 + }, + { + "epoch": 0.4437455325232309, + "grad_norm": 0.2164914309978485, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0355, + "step": 15520 + }, + { + "epoch": 0.4440314510364546, + "grad_norm": 0.3869318664073944, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0361, + "step": 15530 + }, + { + "epoch": 0.4443173695496783, + "grad_norm": 0.3843154311180115, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0459, + "step": 15540 + }, + { + "epoch": 0.44460328806290206, + "grad_norm": 0.8488825559616089, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0406, + "step": 15550 + }, + { + "epoch": 0.4448892065761258, + "grad_norm": 0.5055183172225952, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0359, + "step": 15560 + }, + { + "epoch": 0.44517512508934953, + "grad_norm": 0.40923011302948, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0435, + "step": 15570 + }, + { + "epoch": 0.44546104360257327, + "grad_norm": 0.48997730016708374, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0395, + "step": 15580 + }, + { + "epoch": 0.445746962115797, + "grad_norm": 0.5149131417274475, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.041, + "step": 15590 + }, + { + "epoch": 0.44603288062902074, + "grad_norm": 0.7277303338050842, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0452, + "step": 15600 + }, + { + "epoch": 0.4463187991422445, + "grad_norm": 0.48676377534866333, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0363, + "step": 15610 + }, + { + "epoch": 0.44660471765546816, + "grad_norm": 0.49031221866607666, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0356, + "step": 15620 + }, + { + "epoch": 0.4468906361686919, + "grad_norm": 0.38877514004707336, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.036, + "step": 15630 + }, + { + "epoch": 0.44717655468191564, + "grad_norm": 0.570068895816803, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0403, + "step": 15640 + }, + { + "epoch": 0.4474624731951394, + "grad_norm": 0.48499882221221924, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0395, + "step": 15650 + }, + { + "epoch": 0.4477483917083631, + "grad_norm": 0.7251732349395752, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0399, + "step": 15660 + }, + { + "epoch": 0.44803431022158685, + "grad_norm": 0.3927334249019623, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0359, + "step": 15670 + }, + { + "epoch": 0.4483202287348106, + "grad_norm": 0.5614549517631531, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.035, + "step": 15680 + }, + { + "epoch": 0.4486061472480343, + "grad_norm": 0.383831262588501, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0416, + "step": 15690 + }, + { + "epoch": 0.44889206576125806, + "grad_norm": 1.9365276098251343, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0498, + "step": 15700 + }, + { + "epoch": 0.44917798427448175, + "grad_norm": 0.6964924931526184, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.034, + "step": 15710 + }, + { + "epoch": 0.4494639027877055, + "grad_norm": 0.5148108601570129, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0401, + "step": 15720 + }, + { + "epoch": 0.4497498213009292, + "grad_norm": 0.4529317617416382, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0361, + "step": 15730 + }, + { + "epoch": 0.45003573981415296, + "grad_norm": 0.6648512482643127, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0365, + "step": 15740 + }, + { + "epoch": 0.4503216583273767, + "grad_norm": 0.8183113932609558, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0416, + "step": 15750 + }, + { + "epoch": 0.45060757684060043, + "grad_norm": 0.8802638649940491, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0406, + "step": 15760 + }, + { + "epoch": 0.45089349535382417, + "grad_norm": 0.6329004764556885, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0395, + "step": 15770 + }, + { + "epoch": 0.4511794138670479, + "grad_norm": 0.35283520817756653, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0364, + "step": 15780 + }, + { + "epoch": 0.45146533238027164, + "grad_norm": 0.5156061053276062, + "learning_rate": 1.071827766589186e-05, + "loss": 0.031, + "step": 15790 + }, + { + "epoch": 0.4517512508934953, + "grad_norm": 0.37875205278396606, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0375, + "step": 15800 + }, + { + "epoch": 0.45203716940671906, + "grad_norm": 0.5543273687362671, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0421, + "step": 15810 + }, + { + "epoch": 0.4523230879199428, + "grad_norm": 0.3808431923389435, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0323, + "step": 15820 + }, + { + "epoch": 0.45260900643316654, + "grad_norm": 0.8648643493652344, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0396, + "step": 15830 + }, + { + "epoch": 0.4528949249463903, + "grad_norm": 0.7893536686897278, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0417, + "step": 15840 + }, + { + "epoch": 0.453180843459614, + "grad_norm": 0.904137134552002, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0384, + "step": 15850 + }, + { + "epoch": 0.45346676197283775, + "grad_norm": 0.6095889806747437, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0457, + "step": 15860 + }, + { + "epoch": 0.4537526804860615, + "grad_norm": 0.5691415667533875, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0438, + "step": 15870 + }, + { + "epoch": 0.4540385989992852, + "grad_norm": 0.37868618965148926, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0414, + "step": 15880 + }, + { + "epoch": 0.4543245175125089, + "grad_norm": 0.7962950468063354, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0405, + "step": 15890 + }, + { + "epoch": 0.45461043602573264, + "grad_norm": 0.8862378597259521, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0475, + "step": 15900 + }, + { + "epoch": 0.4548963545389564, + "grad_norm": 0.8762509822845459, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0472, + "step": 15910 + }, + { + "epoch": 0.4551822730521801, + "grad_norm": 0.6006313562393188, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0417, + "step": 15920 + }, + { + "epoch": 0.45546819156540386, + "grad_norm": 0.3340131938457489, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0374, + "step": 15930 + }, + { + "epoch": 0.4557541100786276, + "grad_norm": 0.2639552056789398, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0387, + "step": 15940 + }, + { + "epoch": 0.45604002859185133, + "grad_norm": 0.42564907670021057, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0376, + "step": 15950 + }, + { + "epoch": 0.45632594710507507, + "grad_norm": 0.503834068775177, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0344, + "step": 15960 + }, + { + "epoch": 0.4566118656182988, + "grad_norm": 0.5962334871292114, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0379, + "step": 15970 + }, + { + "epoch": 0.4568977841315225, + "grad_norm": 0.3271556794643402, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0361, + "step": 15980 + }, + { + "epoch": 0.4571837026447462, + "grad_norm": 0.5501612424850464, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0356, + "step": 15990 + }, + { + "epoch": 0.45746962115796996, + "grad_norm": 1.0399914979934692, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.039, + "step": 16000 + }, + { + "epoch": 0.4577555396711937, + "grad_norm": 0.42251288890838623, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0413, + "step": 16010 + }, + { + "epoch": 0.45804145818441744, + "grad_norm": 0.5694882869720459, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0501, + "step": 16020 + }, + { + "epoch": 0.4583273766976412, + "grad_norm": 0.37367814779281616, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0388, + "step": 16030 + }, + { + "epoch": 0.4586132952108649, + "grad_norm": 0.7947224974632263, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0324, + "step": 16040 + }, + { + "epoch": 0.45889921372408865, + "grad_norm": 0.47871798276901245, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0345, + "step": 16050 + }, + { + "epoch": 0.4591851322373124, + "grad_norm": 1.4443609714508057, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0502, + "step": 16060 + }, + { + "epoch": 0.45947105075053607, + "grad_norm": 0.8326191902160645, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0325, + "step": 16070 + }, + { + "epoch": 0.4597569692637598, + "grad_norm": 0.2887400686740875, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.035, + "step": 16080 + }, + { + "epoch": 0.46004288777698354, + "grad_norm": 0.34353405237197876, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0324, + "step": 16090 + }, + { + "epoch": 0.4603288062902073, + "grad_norm": 0.7319850325584412, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0307, + "step": 16100 + }, + { + "epoch": 0.460614724803431, + "grad_norm": 0.6628556847572327, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0398, + "step": 16110 + }, + { + "epoch": 0.46090064331665476, + "grad_norm": 0.39974722266197205, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.038, + "step": 16120 + }, + { + "epoch": 0.4611865618298785, + "grad_norm": 0.7769339680671692, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0425, + "step": 16130 + }, + { + "epoch": 0.46147248034310223, + "grad_norm": 0.6823691129684448, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.039, + "step": 16140 + }, + { + "epoch": 0.46175839885632597, + "grad_norm": 0.6749460697174072, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0388, + "step": 16150 + }, + { + "epoch": 0.46204431736954965, + "grad_norm": 1.0745635032653809, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0406, + "step": 16160 + }, + { + "epoch": 0.4623302358827734, + "grad_norm": 0.8388734459877014, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0345, + "step": 16170 + }, + { + "epoch": 0.4626161543959971, + "grad_norm": 0.675828218460083, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0355, + "step": 16180 + }, + { + "epoch": 0.46290207290922086, + "grad_norm": 0.9872504472732544, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0374, + "step": 16190 + }, + { + "epoch": 0.4631879914224446, + "grad_norm": 0.4705125689506531, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0416, + "step": 16200 + }, + { + "epoch": 0.46347390993566834, + "grad_norm": 0.43577539920806885, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.041, + "step": 16210 + }, + { + "epoch": 0.4637598284488921, + "grad_norm": 0.6472166180610657, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0372, + "step": 16220 + }, + { + "epoch": 0.4640457469621158, + "grad_norm": 1.0108906030654907, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0464, + "step": 16230 + }, + { + "epoch": 0.46433166547533955, + "grad_norm": 0.6221884489059448, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0396, + "step": 16240 + }, + { + "epoch": 0.46461758398856323, + "grad_norm": 0.7375202178955078, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0365, + "step": 16250 + }, + { + "epoch": 0.46490350250178697, + "grad_norm": 0.5090222358703613, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0404, + "step": 16260 + }, + { + "epoch": 0.4651894210150107, + "grad_norm": 0.5641722679138184, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0424, + "step": 16270 + }, + { + "epoch": 0.46547533952823444, + "grad_norm": 0.3946240246295929, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0433, + "step": 16280 + }, + { + "epoch": 0.4657612580414582, + "grad_norm": 0.525059700012207, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0399, + "step": 16290 + }, + { + "epoch": 0.4660471765546819, + "grad_norm": 0.6106441617012024, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0417, + "step": 16300 + }, + { + "epoch": 0.46633309506790566, + "grad_norm": 0.7064299583435059, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0331, + "step": 16310 + }, + { + "epoch": 0.4666190135811294, + "grad_norm": 0.6251654624938965, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0377, + "step": 16320 + }, + { + "epoch": 0.46690493209435313, + "grad_norm": 0.6626482009887695, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0355, + "step": 16330 + }, + { + "epoch": 0.4671908506075768, + "grad_norm": 0.32827794551849365, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0438, + "step": 16340 + }, + { + "epoch": 0.46747676912080055, + "grad_norm": 1.147644281387329, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.041, + "step": 16350 + }, + { + "epoch": 0.4677626876340243, + "grad_norm": 0.5785626769065857, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0362, + "step": 16360 + }, + { + "epoch": 0.468048606147248, + "grad_norm": 0.7087936401367188, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0364, + "step": 16370 + }, + { + "epoch": 0.46833452466047176, + "grad_norm": 0.7729533314704895, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0357, + "step": 16380 + }, + { + "epoch": 0.4686204431736955, + "grad_norm": 0.9080077409744263, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0445, + "step": 16390 + }, + { + "epoch": 0.46890636168691924, + "grad_norm": 0.5273067355155945, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0395, + "step": 16400 + }, + { + "epoch": 0.469192280200143, + "grad_norm": 0.4801991581916809, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0469, + "step": 16410 + }, + { + "epoch": 0.4694781987133667, + "grad_norm": 0.38060688972473145, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0377, + "step": 16420 + }, + { + "epoch": 0.4697641172265904, + "grad_norm": 1.335648536682129, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0444, + "step": 16430 + }, + { + "epoch": 0.47005003573981413, + "grad_norm": 0.6224690079689026, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0365, + "step": 16440 + }, + { + "epoch": 0.47033595425303787, + "grad_norm": 0.39938899874687195, + "learning_rate": 1.007637577910799e-05, + "loss": 0.037, + "step": 16450 + }, + { + "epoch": 0.4706218727662616, + "grad_norm": 0.47899872064590454, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0371, + "step": 16460 + }, + { + "epoch": 0.47090779127948534, + "grad_norm": 0.8991144895553589, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0337, + "step": 16470 + }, + { + "epoch": 0.4711937097927091, + "grad_norm": 0.6228598356246948, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0388, + "step": 16480 + }, + { + "epoch": 0.4714796283059328, + "grad_norm": 0.41108259558677673, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0378, + "step": 16490 + }, + { + "epoch": 0.47176554681915656, + "grad_norm": 0.722955048084259, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0381, + "step": 16500 + }, + { + "epoch": 0.4720514653323803, + "grad_norm": 0.6090973019599915, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0348, + "step": 16510 + }, + { + "epoch": 0.472337383845604, + "grad_norm": 0.483549565076828, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0456, + "step": 16520 + }, + { + "epoch": 0.4726233023588277, + "grad_norm": 0.4134727418422699, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0444, + "step": 16530 + }, + { + "epoch": 0.47290922087205145, + "grad_norm": 0.4629753530025482, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0382, + "step": 16540 + }, + { + "epoch": 0.4731951393852752, + "grad_norm": 0.8709504008293152, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0384, + "step": 16550 + }, + { + "epoch": 0.4734810578984989, + "grad_norm": 0.683397114276886, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0398, + "step": 16560 + }, + { + "epoch": 0.47376697641172266, + "grad_norm": 0.5743465423583984, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0431, + "step": 16570 + }, + { + "epoch": 0.4740528949249464, + "grad_norm": 1.0080480575561523, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0378, + "step": 16580 + }, + { + "epoch": 0.47433881343817014, + "grad_norm": 0.4668700098991394, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0369, + "step": 16590 + }, + { + "epoch": 0.4746247319513939, + "grad_norm": 0.6005896925926208, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0508, + "step": 16600 + }, + { + "epoch": 0.47491065046461756, + "grad_norm": 0.5788530707359314, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0354, + "step": 16610 + }, + { + "epoch": 0.4751965689778413, + "grad_norm": 0.38784441351890564, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0357, + "step": 16620 + }, + { + "epoch": 0.47548248749106503, + "grad_norm": 0.4809567928314209, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0331, + "step": 16630 + }, + { + "epoch": 0.47576840600428877, + "grad_norm": 0.6647809147834778, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0473, + "step": 16640 + }, + { + "epoch": 0.4760543245175125, + "grad_norm": 0.3968522548675537, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0304, + "step": 16650 + }, + { + "epoch": 0.47634024303073624, + "grad_norm": 0.3258526027202606, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0387, + "step": 16660 + }, + { + "epoch": 0.47662616154396, + "grad_norm": 0.43442079424858093, + "learning_rate": 9.863295834019308e-06, + "loss": 0.04, + "step": 16670 + }, + { + "epoch": 0.4769120800571837, + "grad_norm": 0.36909565329551697, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0351, + "step": 16680 + }, + { + "epoch": 0.47719799857040746, + "grad_norm": 0.5566768050193787, + "learning_rate": 9.843955128197274e-06, + "loss": 0.031, + "step": 16690 + }, + { + "epoch": 0.47748391708363114, + "grad_norm": 0.5705142617225647, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0359, + "step": 16700 + }, + { + "epoch": 0.4777698355968549, + "grad_norm": 0.28931716084480286, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0407, + "step": 16710 + }, + { + "epoch": 0.4780557541100786, + "grad_norm": 0.5509498715400696, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0363, + "step": 16720 + }, + { + "epoch": 0.47834167262330235, + "grad_norm": 0.3564346432685852, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0364, + "step": 16730 + }, + { + "epoch": 0.4786275911365261, + "grad_norm": 0.32734423875808716, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0369, + "step": 16740 + }, + { + "epoch": 0.4789135096497498, + "grad_norm": 0.3048594892024994, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0367, + "step": 16750 + }, + { + "epoch": 0.47919942816297356, + "grad_norm": 0.9007049798965454, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0377, + "step": 16760 + }, + { + "epoch": 0.4794853466761973, + "grad_norm": 0.7010983824729919, + "learning_rate": 9.76664747972605e-06, + "loss": 0.039, + "step": 16770 + }, + { + "epoch": 0.47977126518942104, + "grad_norm": 0.644473135471344, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0466, + "step": 16780 + }, + { + "epoch": 0.4800571837026447, + "grad_norm": 0.6333492398262024, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0373, + "step": 16790 + }, + { + "epoch": 0.48034310221586846, + "grad_norm": 0.5148355960845947, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0392, + "step": 16800 + }, + { + "epoch": 0.4806290207290922, + "grad_norm": 0.7288355231285095, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0381, + "step": 16810 + }, + { + "epoch": 0.48091493924231593, + "grad_norm": 0.3674873113632202, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0418, + "step": 16820 + }, + { + "epoch": 0.48120085775553967, + "grad_norm": 0.5055420398712158, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0336, + "step": 16830 + }, + { + "epoch": 0.4814867762687634, + "grad_norm": 0.641754686832428, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0342, + "step": 16840 + }, + { + "epoch": 0.48177269478198714, + "grad_norm": 0.308200478553772, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0364, + "step": 16850 + }, + { + "epoch": 0.4820586132952109, + "grad_norm": 0.41361021995544434, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0342, + "step": 16860 + }, + { + "epoch": 0.4823445318084346, + "grad_norm": 0.45777833461761475, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0353, + "step": 16870 + }, + { + "epoch": 0.4826304503216583, + "grad_norm": 0.7587664723396301, + "learning_rate": 9.660501900166734e-06, + "loss": 0.043, + "step": 16880 + }, + { + "epoch": 0.48291636883488204, + "grad_norm": 0.8740283250808716, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0372, + "step": 16890 + }, + { + "epoch": 0.4832022873481058, + "grad_norm": 0.3009270429611206, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0373, + "step": 16900 + }, + { + "epoch": 0.4834882058613295, + "grad_norm": 0.4439285695552826, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0349, + "step": 16910 + }, + { + "epoch": 0.48377412437455325, + "grad_norm": 0.39849671721458435, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0394, + "step": 16920 + }, + { + "epoch": 0.484060042887777, + "grad_norm": 0.6423043608665466, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0413, + "step": 16930 + }, + { + "epoch": 0.4843459614010007, + "grad_norm": 0.3683928847312927, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0387, + "step": 16940 + }, + { + "epoch": 0.48463187991422446, + "grad_norm": 0.7087769508361816, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0397, + "step": 16950 + }, + { + "epoch": 0.4849177984274482, + "grad_norm": 0.5348120927810669, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0405, + "step": 16960 + }, + { + "epoch": 0.4852037169406719, + "grad_norm": 0.549891471862793, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0363, + "step": 16970 + }, + { + "epoch": 0.4854896354538956, + "grad_norm": 0.7177272439002991, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0343, + "step": 16980 + }, + { + "epoch": 0.48577555396711936, + "grad_norm": 0.595417320728302, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0439, + "step": 16990 + }, + { + "epoch": 0.4860614724803431, + "grad_norm": 0.4838889241218567, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0387, + "step": 17000 + }, + { + "epoch": 0.48634739099356683, + "grad_norm": 0.6186223030090332, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0362, + "step": 17010 + }, + { + "epoch": 0.48663330950679057, + "grad_norm": 0.43383121490478516, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0381, + "step": 17020 + }, + { + "epoch": 0.4869192280200143, + "grad_norm": 0.6735527515411377, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0388, + "step": 17030 + }, + { + "epoch": 0.48720514653323804, + "grad_norm": 0.3746320605278015, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0491, + "step": 17040 + }, + { + "epoch": 0.4874910650464618, + "grad_norm": 0.29500988125801086, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0395, + "step": 17050 + }, + { + "epoch": 0.48777698355968546, + "grad_norm": 0.8518465757369995, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0435, + "step": 17060 + }, + { + "epoch": 0.4880629020729092, + "grad_norm": 0.9653190970420837, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0393, + "step": 17070 + }, + { + "epoch": 0.48834882058613294, + "grad_norm": 0.785724937915802, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0372, + "step": 17080 + }, + { + "epoch": 0.4886347390993567, + "grad_norm": 0.9450638890266418, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0406, + "step": 17090 + }, + { + "epoch": 0.4889206576125804, + "grad_norm": 0.645124077796936, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0361, + "step": 17100 + }, + { + "epoch": 0.48920657612580415, + "grad_norm": 0.3352372944355011, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0417, + "step": 17110 + }, + { + "epoch": 0.4894924946390279, + "grad_norm": 0.3858814835548401, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0345, + "step": 17120 + }, + { + "epoch": 0.4897784131522516, + "grad_norm": 0.5403604507446289, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0326, + "step": 17130 + }, + { + "epoch": 0.49006433166547536, + "grad_norm": 0.6986777782440186, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0417, + "step": 17140 + }, + { + "epoch": 0.49035025017869904, + "grad_norm": 0.5456675887107849, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0473, + "step": 17150 + }, + { + "epoch": 0.4906361686919228, + "grad_norm": 0.3961554765701294, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0341, + "step": 17160 + }, + { + "epoch": 0.4909220872051465, + "grad_norm": 0.5188277363777161, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0369, + "step": 17170 + }, + { + "epoch": 0.49120800571837026, + "grad_norm": 0.6042230725288391, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0352, + "step": 17180 + }, + { + "epoch": 0.491493924231594, + "grad_norm": 0.5485941171646118, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0405, + "step": 17190 + }, + { + "epoch": 0.49177984274481773, + "grad_norm": 0.5856509804725647, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0402, + "step": 17200 + }, + { + "epoch": 0.49206576125804147, + "grad_norm": 0.8656556010246277, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0349, + "step": 17210 + }, + { + "epoch": 0.4923516797712652, + "grad_norm": 0.4041757583618164, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0364, + "step": 17220 + }, + { + "epoch": 0.49263759828448894, + "grad_norm": 0.6135975122451782, + "learning_rate": 9.324104146177972e-06, + "loss": 0.036, + "step": 17230 + }, + { + "epoch": 0.4929235167977126, + "grad_norm": 0.5101860165596008, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0359, + "step": 17240 + }, + { + "epoch": 0.49320943531093636, + "grad_norm": 0.9913426041603088, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0552, + "step": 17250 + }, + { + "epoch": 0.4934953538241601, + "grad_norm": 0.6148158311843872, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0388, + "step": 17260 + }, + { + "epoch": 0.49378127233738384, + "grad_norm": 0.6651721596717834, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0374, + "step": 17270 + }, + { + "epoch": 0.4940671908506076, + "grad_norm": 0.9545061588287354, + "learning_rate": 9.276232738281744e-06, + "loss": 0.035, + "step": 17280 + }, + { + "epoch": 0.4943531093638313, + "grad_norm": 0.8923225402832031, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0366, + "step": 17290 + }, + { + "epoch": 0.49463902787705505, + "grad_norm": 0.5337848663330078, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0354, + "step": 17300 + }, + { + "epoch": 0.4949249463902788, + "grad_norm": 0.35039281845092773, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0341, + "step": 17310 + }, + { + "epoch": 0.4952108649035025, + "grad_norm": 0.47406911849975586, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0393, + "step": 17320 + }, + { + "epoch": 0.4954967834167262, + "grad_norm": 0.6226631999015808, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0375, + "step": 17330 + }, + { + "epoch": 0.49578270192994994, + "grad_norm": 0.6652712821960449, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0363, + "step": 17340 + }, + { + "epoch": 0.4960686204431737, + "grad_norm": 1.0042835474014282, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0368, + "step": 17350 + }, + { + "epoch": 0.4963545389563974, + "grad_norm": 0.4334045648574829, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0375, + "step": 17360 + }, + { + "epoch": 0.49664045746962115, + "grad_norm": 0.3561633229255676, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0347, + "step": 17370 + }, + { + "epoch": 0.4969263759828449, + "grad_norm": 0.5763550996780396, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0344, + "step": 17380 + }, + { + "epoch": 0.49721229449606863, + "grad_norm": 0.6306643486022949, + "learning_rate": 9.171095634265995e-06, + "loss": 0.037, + "step": 17390 + }, + { + "epoch": 0.49749821300929237, + "grad_norm": 0.4286569058895111, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0308, + "step": 17400 + }, + { + "epoch": 0.4977841315225161, + "grad_norm": 0.577983558177948, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0393, + "step": 17410 + }, + { + "epoch": 0.4980700500357398, + "grad_norm": 0.5714932084083557, + "learning_rate": 9.142466323573853e-06, + "loss": 0.038, + "step": 17420 + }, + { + "epoch": 0.4983559685489635, + "grad_norm": 0.7529498338699341, + "learning_rate": 9.132927564918328e-06, + "loss": 0.033, + "step": 17430 + }, + { + "epoch": 0.49864188706218726, + "grad_norm": 0.5179672241210938, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0367, + "step": 17440 + }, + { + "epoch": 0.498927805575411, + "grad_norm": 0.38424569368362427, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0401, + "step": 17450 + }, + { + "epoch": 0.49921372408863474, + "grad_norm": 0.469460129737854, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0379, + "step": 17460 + }, + { + "epoch": 0.4994996426018585, + "grad_norm": 0.3285387456417084, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0399, + "step": 17470 + }, + { + "epoch": 0.4997855611150822, + "grad_norm": 0.49863550066947937, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0313, + "step": 17480 + }, + { + "epoch": 0.5000714796283059, + "grad_norm": 0.3926186263561249, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0454, + "step": 17490 + }, + { + "epoch": 0.5003573981415297, + "grad_norm": 0.4476146399974823, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0472, + "step": 17500 + }, + { + "epoch": 0.5006433166547534, + "grad_norm": 0.5645599961280823, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0358, + "step": 17510 + }, + { + "epoch": 0.5009292351679772, + "grad_norm": 0.4813307225704193, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0364, + "step": 17520 + }, + { + "epoch": 0.5012151536812008, + "grad_norm": 0.49410971999168396, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0303, + "step": 17530 + }, + { + "epoch": 0.5015010721944246, + "grad_norm": 0.7172105312347412, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0404, + "step": 17540 + }, + { + "epoch": 0.5017869907076483, + "grad_norm": 0.43401873111724854, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0402, + "step": 17550 + }, + { + "epoch": 0.502072909220872, + "grad_norm": 0.6497406363487244, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0364, + "step": 17560 + }, + { + "epoch": 0.5023588277340958, + "grad_norm": 0.44618356227874756, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0337, + "step": 17570 + }, + { + "epoch": 0.5026447462473195, + "grad_norm": 0.4186992049217224, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0381, + "step": 17580 + }, + { + "epoch": 0.5029306647605433, + "grad_norm": 0.7387974858283997, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0319, + "step": 17590 + }, + { + "epoch": 0.503216583273767, + "grad_norm": 0.8068642020225525, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0373, + "step": 17600 + }, + { + "epoch": 0.5035025017869907, + "grad_norm": 0.5773473978042603, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0372, + "step": 17610 + }, + { + "epoch": 0.5037884203002144, + "grad_norm": 0.32488778233528137, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0334, + "step": 17620 + }, + { + "epoch": 0.5040743388134382, + "grad_norm": 0.33978500962257385, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0493, + "step": 17630 + }, + { + "epoch": 0.5043602573266619, + "grad_norm": 0.5897071361541748, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0335, + "step": 17640 + }, + { + "epoch": 0.5046461758398856, + "grad_norm": 0.6275895833969116, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0395, + "step": 17650 + }, + { + "epoch": 0.5049320943531094, + "grad_norm": 0.7995536923408508, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0422, + "step": 17660 + }, + { + "epoch": 0.505218012866333, + "grad_norm": 0.8734716773033142, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0414, + "step": 17670 + }, + { + "epoch": 0.5055039313795568, + "grad_norm": 0.6239343881607056, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0333, + "step": 17680 + }, + { + "epoch": 0.5057898498927805, + "grad_norm": 0.42508623003959656, + "learning_rate": 8.885721609997551e-06, + "loss": 0.045, + "step": 17690 + }, + { + "epoch": 0.5060757684060043, + "grad_norm": 0.4272485673427582, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0506, + "step": 17700 + }, + { + "epoch": 0.506361686919228, + "grad_norm": 0.8006368279457092, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0431, + "step": 17710 + }, + { + "epoch": 0.5066476054324518, + "grad_norm": 0.5896835327148438, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0322, + "step": 17720 + }, + { + "epoch": 0.5069335239456755, + "grad_norm": 0.6880389451980591, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0322, + "step": 17730 + }, + { + "epoch": 0.5072194424588992, + "grad_norm": 1.4850202798843384, + "learning_rate": 8.83836825410936e-06, + "loss": 0.052, + "step": 17740 + }, + { + "epoch": 0.507505360972123, + "grad_norm": 0.7684240937232971, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0353, + "step": 17750 + }, + { + "epoch": 0.5077912794853466, + "grad_norm": 0.5456307530403137, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0419, + "step": 17760 + }, + { + "epoch": 0.5080771979985704, + "grad_norm": 0.5775120258331299, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0366, + "step": 17770 + }, + { + "epoch": 0.5083631165117941, + "grad_norm": 0.6453070044517517, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0341, + "step": 17780 + }, + { + "epoch": 0.5086490350250179, + "grad_norm": 0.7906973361968994, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0405, + "step": 17790 + }, + { + "epoch": 0.5089349535382416, + "grad_norm": 1.0740606784820557, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0344, + "step": 17800 + }, + { + "epoch": 0.5092208720514654, + "grad_norm": 0.41854357719421387, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0334, + "step": 17810 + }, + { + "epoch": 0.5095067905646891, + "grad_norm": 0.6328964233398438, + "learning_rate": 8.762735374981932e-06, + "loss": 0.036, + "step": 17820 + }, + { + "epoch": 0.5097927090779127, + "grad_norm": 0.40875789523124695, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0338, + "step": 17830 + }, + { + "epoch": 0.5100786275911365, + "grad_norm": 0.5056312084197998, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0332, + "step": 17840 + }, + { + "epoch": 0.5103645461043602, + "grad_norm": 0.5005037784576416, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0416, + "step": 17850 + }, + { + "epoch": 0.510650464617584, + "grad_norm": 0.5689167380332947, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0329, + "step": 17860 + }, + { + "epoch": 0.5109363831308077, + "grad_norm": 0.5222717523574829, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0336, + "step": 17870 + }, + { + "epoch": 0.5112223016440315, + "grad_norm": 0.5998329520225525, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0354, + "step": 17880 + }, + { + "epoch": 0.5115082201572552, + "grad_norm": 0.4684480130672455, + "learning_rate": 8.69669425266315e-06, + "loss": 0.05, + "step": 17890 + }, + { + "epoch": 0.511794138670479, + "grad_norm": 0.4061124622821808, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0384, + "step": 17900 + }, + { + "epoch": 0.5120800571837026, + "grad_norm": 0.5025928020477295, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0386, + "step": 17910 + }, + { + "epoch": 0.5123659756969263, + "grad_norm": 0.3731222152709961, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0378, + "step": 17920 + }, + { + "epoch": 0.5126518942101501, + "grad_norm": 0.7784973978996277, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0419, + "step": 17930 + }, + { + "epoch": 0.5129378127233738, + "grad_norm": 0.7074074745178223, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0386, + "step": 17940 + }, + { + "epoch": 0.5132237312365976, + "grad_norm": 0.49802306294441223, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0418, + "step": 17950 + }, + { + "epoch": 0.5135096497498213, + "grad_norm": 0.4355427920818329, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0431, + "step": 17960 + }, + { + "epoch": 0.5137955682630451, + "grad_norm": 0.672635555267334, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0403, + "step": 17970 + }, + { + "epoch": 0.5140814867762687, + "grad_norm": 0.6733908653259277, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0487, + "step": 17980 + }, + { + "epoch": 0.5143674052894925, + "grad_norm": 0.43711504340171814, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0378, + "step": 17990 + }, + { + "epoch": 0.5146533238027162, + "grad_norm": 0.6371222138404846, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0336, + "step": 18000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.124350027628544e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/training_args.bin b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a8e9db2fc8c02e02c3d9dc8ab6720ad303a5b3a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612ba70c7690571cb25b3741b149289d0da6675f330268700d4dd75e92ecc19a +size 6097 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/added_tokens.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/generation_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00001-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..55ea0d594415ec9d9c2c174d6647679a60851be4 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5848bbd03f0ff3d7ce518be07ddb20c493b2ca15e22eb9afe25080ab697b7814 +size 4921072616 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00002-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8662e3c76d846eb6096f73646ceaf5815c40efff --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dabff3e17a44df7a046e2d4ce1d2debafe2cc256157317ba15aebd0d6c47616 +size 4978830984 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00003-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4464e054ac8cf8db979319ad197ba4e74deaa576 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3496ea0bcf2bea820754262fb0be5f1766700de2642826976f6bd8b9790a2f57 +size 4100977896 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model.safetensors.index.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/norm_stats.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..7a37358d95e92a337ffbc69008e6d3a514583ff2 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -15.553912042236327, + -29.199742523193358, + -19.58108451538086, + -2.290254103851318, + -3.98537020587921, + -3.326780859374999, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 20.256868560791013, + 29.94644501495361, + 21.81786548461914, + 2.931905368041992, + 5.064435471534729, + 3.8213318216323877, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.8829866647720337, + 2.0021812915802, + 0.2094610631465912, + 0.0940750315785408, + 0.0910087525844574, + 0.012966467998921871, + -0.09716881066560745, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.976093769073486, + 10.930583953857422, + 8.330232620239258, + 0.8605863451957703, + 1.5304595232009888, + 1.1747541427612305, + 0.995267927646637, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -14.624815139007566, + -31.510755078125, + -35.281760287475585, + -4.413841687011719, + -8.509904860687255, + -6.548201916885375, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 40.4127169593811, + 31.91034956970215, + 26.84413584289551, + 7.540738459014893, + 10.178268561553956, + 9.913993389892582, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 10.31286334991455, + 3.0421667098999023, + -4.947638511657715, + 0.41632387042045593, + -0.9987452030181885, + -0.18793217837810516, + -0.08814626932144165, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 10.463665962219238, + 14.231209754943848, + 11.03242301940918, + 2.1795010566711426, + 3.3540749549865723, + 2.708117961883545, + 0.9961075186729431, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/pi0.yaml b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/special_tokens_map.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer.model b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/trainer_state.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..46577ed8495b2d4154c80a2fa2ff880c3ab69dd6 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/trainer_state.json @@ -0,0 +1,14034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5718370264474625, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002859185132237312, + "grad_norm": 4.32843542098999, + "learning_rate": 1.8e-07, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0005718370264474624, + "grad_norm": 5.184113502502441, + "learning_rate": 3.8e-07, + "loss": 0.6206, + "step": 20 + }, + { + "epoch": 0.0008577555396711937, + "grad_norm": 4.515527248382568, + "learning_rate": 5.800000000000001e-07, + "loss": 0.582, + "step": 30 + }, + { + "epoch": 0.0011436740528949249, + "grad_norm": 2.8382818698883057, + "learning_rate": 7.8e-07, + "loss": 0.544, + "step": 40 + }, + { + "epoch": 0.0014295925661186562, + "grad_norm": 4.019079208374023, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6381, + "step": 50 + }, + { + "epoch": 0.0017155110793423873, + "grad_norm": 2.9916157722473145, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5463, + "step": 60 + }, + { + "epoch": 0.0020014295925661185, + "grad_norm": 3.3288328647613525, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.446, + "step": 70 + }, + { + "epoch": 0.0022873481057898498, + "grad_norm": 3.181410312652588, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4497, + "step": 80 + }, + { + "epoch": 0.002573266619013581, + "grad_norm": 1.421942949295044, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.349, + "step": 90 + }, + { + "epoch": 0.0028591851322373124, + "grad_norm": 1.908596396446228, + "learning_rate": 1.98e-06, + "loss": 0.3338, + "step": 100 + }, + { + "epoch": 0.0031451036454610438, + "grad_norm": 1.8309729099273682, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.0034310221586847747, + "grad_norm": 3.051408290863037, + "learning_rate": 2.38e-06, + "loss": 0.2418, + "step": 120 + }, + { + "epoch": 0.003716940671908506, + "grad_norm": 2.4083356857299805, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1726, + "step": 130 + }, + { + "epoch": 0.004002859185132237, + "grad_norm": 1.111687421798706, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.2164, + "step": 140 + }, + { + "epoch": 0.004288777698355968, + "grad_norm": 1.3874679803848267, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1312, + "step": 150 + }, + { + "epoch": 0.0045746962115796996, + "grad_norm": 1.2791540622711182, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1198, + "step": 160 + }, + { + "epoch": 0.004860614724803431, + "grad_norm": 1.6237181425094604, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1027, + "step": 170 + }, + { + "epoch": 0.005146533238027162, + "grad_norm": 0.9669432640075684, + "learning_rate": 3.58e-06, + "loss": 0.0968, + "step": 180 + }, + { + "epoch": 0.0054324517512508936, + "grad_norm": 1.4933182001113892, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.1012, + "step": 190 + }, + { + "epoch": 0.005718370264474625, + "grad_norm": 1.8615745306015015, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0901, + "step": 200 + }, + { + "epoch": 0.006004288777698356, + "grad_norm": 1.867163062095642, + "learning_rate": 4.18e-06, + "loss": 0.1067, + "step": 210 + }, + { + "epoch": 0.0062902072909220876, + "grad_norm": 1.199497103691101, + "learning_rate": 4.38e-06, + "loss": 0.0841, + "step": 220 + }, + { + "epoch": 0.006576125804145818, + "grad_norm": 1.1568272113800049, + "learning_rate": 4.58e-06, + "loss": 0.0951, + "step": 230 + }, + { + "epoch": 0.006862044317369549, + "grad_norm": 2.139226198196411, + "learning_rate": 4.78e-06, + "loss": 0.0845, + "step": 240 + }, + { + "epoch": 0.007147962830593281, + "grad_norm": 1.0357667207717896, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0828, + "step": 250 + }, + { + "epoch": 0.007433881343817012, + "grad_norm": 1.0145683288574219, + "learning_rate": 5.18e-06, + "loss": 0.0925, + "step": 260 + }, + { + "epoch": 0.007719799857040743, + "grad_norm": 1.308053731918335, + "learning_rate": 5.380000000000001e-06, + "loss": 0.082, + "step": 270 + }, + { + "epoch": 0.008005718370264474, + "grad_norm": 1.1561739444732666, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0888, + "step": 280 + }, + { + "epoch": 0.008291636883488206, + "grad_norm": 0.8777005672454834, + "learning_rate": 5.78e-06, + "loss": 0.0693, + "step": 290 + }, + { + "epoch": 0.008577555396711936, + "grad_norm": 0.9127368330955505, + "learning_rate": 5.98e-06, + "loss": 0.0823, + "step": 300 + }, + { + "epoch": 0.008863473909935669, + "grad_norm": 0.5608117580413818, + "learning_rate": 6.18e-06, + "loss": 0.0733, + "step": 310 + }, + { + "epoch": 0.009149392423159399, + "grad_norm": 1.9068444967269897, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0772, + "step": 320 + }, + { + "epoch": 0.009435310936383131, + "grad_norm": 0.9090886116027832, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.062, + "step": 330 + }, + { + "epoch": 0.009721229449606862, + "grad_norm": 1.191778540611267, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0718, + "step": 340 + }, + { + "epoch": 0.010007147962830594, + "grad_norm": 1.3743036985397339, + "learning_rate": 6.98e-06, + "loss": 0.0822, + "step": 350 + }, + { + "epoch": 0.010293066476054324, + "grad_norm": 1.4244364500045776, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0793, + "step": 360 + }, + { + "epoch": 0.010578984989278055, + "grad_norm": 1.1766910552978516, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0637, + "step": 370 + }, + { + "epoch": 0.010864903502501787, + "grad_norm": 1.1331329345703125, + "learning_rate": 7.58e-06, + "loss": 0.0705, + "step": 380 + }, + { + "epoch": 0.011150822015725518, + "grad_norm": 0.4898548424243927, + "learning_rate": 7.78e-06, + "loss": 0.0686, + "step": 390 + }, + { + "epoch": 0.01143674052894925, + "grad_norm": 0.7398406267166138, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0719, + "step": 400 + }, + { + "epoch": 0.01172265904217298, + "grad_norm": 1.1516162157058716, + "learning_rate": 8.18e-06, + "loss": 0.0696, + "step": 410 + }, + { + "epoch": 0.012008577555396712, + "grad_norm": 1.6034163236618042, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0698, + "step": 420 + }, + { + "epoch": 0.012294496068620443, + "grad_norm": 1.2195311784744263, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.012580414581844175, + "grad_norm": 1.1106441020965576, + "learning_rate": 8.78e-06, + "loss": 0.0749, + "step": 440 + }, + { + "epoch": 0.012866333095067906, + "grad_norm": 1.1787506341934204, + "learning_rate": 8.98e-06, + "loss": 0.0718, + "step": 450 + }, + { + "epoch": 0.013152251608291636, + "grad_norm": 0.4380492568016052, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0692, + "step": 460 + }, + { + "epoch": 0.013438170121515368, + "grad_norm": 1.0138392448425293, + "learning_rate": 9.38e-06, + "loss": 0.0718, + "step": 470 + }, + { + "epoch": 0.013724088634739099, + "grad_norm": 0.50003582239151, + "learning_rate": 9.58e-06, + "loss": 0.078, + "step": 480 + }, + { + "epoch": 0.014010007147962831, + "grad_norm": 0.6253323554992676, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0762, + "step": 490 + }, + { + "epoch": 0.014295925661186561, + "grad_norm": 0.6725791096687317, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0615, + "step": 500 + }, + { + "epoch": 0.014581844174410294, + "grad_norm": 0.6100206971168518, + "learning_rate": 1.018e-05, + "loss": 0.0576, + "step": 510 + }, + { + "epoch": 0.014867762687634024, + "grad_norm": 1.9225071668624878, + "learning_rate": 1.038e-05, + "loss": 0.0957, + "step": 520 + }, + { + "epoch": 0.015153681200857756, + "grad_norm": 1.304625391960144, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.015439599714081487, + "grad_norm": 0.7657200694084167, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0612, + "step": 540 + }, + { + "epoch": 0.015725518227305217, + "grad_norm": 0.7371220588684082, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0719, + "step": 550 + }, + { + "epoch": 0.016011436740528948, + "grad_norm": 0.7274985313415527, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0594, + "step": 560 + }, + { + "epoch": 0.01629735525375268, + "grad_norm": 1.3222947120666504, + "learning_rate": 1.138e-05, + "loss": 0.0655, + "step": 570 + }, + { + "epoch": 0.016583273766976412, + "grad_norm": 0.965411901473999, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0638, + "step": 580 + }, + { + "epoch": 0.016869192280200142, + "grad_norm": 0.8161532878875732, + "learning_rate": 1.178e-05, + "loss": 0.0532, + "step": 590 + }, + { + "epoch": 0.017155110793423873, + "grad_norm": 0.8228808045387268, + "learning_rate": 1.198e-05, + "loss": 0.051, + "step": 600 + }, + { + "epoch": 0.017441029306647607, + "grad_norm": 0.6932743191719055, + "learning_rate": 1.218e-05, + "loss": 0.0595, + "step": 610 + }, + { + "epoch": 0.017726947819871337, + "grad_norm": 0.6848511099815369, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0589, + "step": 620 + }, + { + "epoch": 0.018012866333095068, + "grad_norm": 1.137454867362976, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0694, + "step": 630 + }, + { + "epoch": 0.018298784846318798, + "grad_norm": 0.8087878227233887, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0738, + "step": 640 + }, + { + "epoch": 0.01858470335954253, + "grad_norm": 0.8093737363815308, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.058, + "step": 650 + }, + { + "epoch": 0.018870621872766263, + "grad_norm": 0.8387401700019836, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0686, + "step": 660 + }, + { + "epoch": 0.019156540385989993, + "grad_norm": 1.1544110774993896, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0592, + "step": 670 + }, + { + "epoch": 0.019442458899213724, + "grad_norm": 0.8208314180374146, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.019728377412437454, + "grad_norm": 0.97088623046875, + "learning_rate": 1.378e-05, + "loss": 0.0675, + "step": 690 + }, + { + "epoch": 0.020014295925661188, + "grad_norm": 1.0991814136505127, + "learning_rate": 1.398e-05, + "loss": 0.0745, + "step": 700 + }, + { + "epoch": 0.02030021443888492, + "grad_norm": 0.9467299580574036, + "learning_rate": 1.418e-05, + "loss": 0.0645, + "step": 710 + }, + { + "epoch": 0.02058613295210865, + "grad_norm": 0.4910801351070404, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0466, + "step": 720 + }, + { + "epoch": 0.02087205146533238, + "grad_norm": 1.0102845430374146, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0735, + "step": 730 + }, + { + "epoch": 0.02115796997855611, + "grad_norm": 0.9033467769622803, + "learning_rate": 1.478e-05, + "loss": 0.0741, + "step": 740 + }, + { + "epoch": 0.021443888491779844, + "grad_norm": 1.6092171669006348, + "learning_rate": 1.498e-05, + "loss": 0.0737, + "step": 750 + }, + { + "epoch": 0.021729807005003574, + "grad_norm": 0.7047333717346191, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0604, + "step": 760 + }, + { + "epoch": 0.022015725518227305, + "grad_norm": 1.2015491724014282, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0646, + "step": 770 + }, + { + "epoch": 0.022301644031451035, + "grad_norm": 1.1669623851776123, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0587, + "step": 780 + }, + { + "epoch": 0.02258756254467477, + "grad_norm": 1.137113094329834, + "learning_rate": 1.578e-05, + "loss": 0.0692, + "step": 790 + }, + { + "epoch": 0.0228734810578985, + "grad_norm": 1.269505262374878, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0711, + "step": 800 + }, + { + "epoch": 0.02315939957112223, + "grad_norm": 0.942534863948822, + "learning_rate": 1.618e-05, + "loss": 0.0782, + "step": 810 + }, + { + "epoch": 0.02344531808434596, + "grad_norm": 0.9548556208610535, + "learning_rate": 1.638e-05, + "loss": 0.0814, + "step": 820 + }, + { + "epoch": 0.02373123659756969, + "grad_norm": 1.0210421085357666, + "learning_rate": 1.658e-05, + "loss": 0.0774, + "step": 830 + }, + { + "epoch": 0.024017155110793425, + "grad_norm": 1.0955135822296143, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0693, + "step": 840 + }, + { + "epoch": 0.024303073624017155, + "grad_norm": 1.2081682682037354, + "learning_rate": 1.698e-05, + "loss": 0.0589, + "step": 850 + }, + { + "epoch": 0.024588992137240886, + "grad_norm": 0.9728164076805115, + "learning_rate": 1.718e-05, + "loss": 0.0585, + "step": 860 + }, + { + "epoch": 0.024874910650464616, + "grad_norm": 1.310244083404541, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.066, + "step": 870 + }, + { + "epoch": 0.02516082916368835, + "grad_norm": 0.8860681653022766, + "learning_rate": 1.758e-05, + "loss": 0.0703, + "step": 880 + }, + { + "epoch": 0.02544674767691208, + "grad_norm": 2.1878466606140137, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0913, + "step": 890 + }, + { + "epoch": 0.02573266619013581, + "grad_norm": 0.6659205555915833, + "learning_rate": 1.798e-05, + "loss": 0.0603, + "step": 900 + }, + { + "epoch": 0.02601858470335954, + "grad_norm": 0.6700656414031982, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.074, + "step": 910 + }, + { + "epoch": 0.026304503216583272, + "grad_norm": 0.8292778134346008, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0529, + "step": 920 + }, + { + "epoch": 0.026590421729807006, + "grad_norm": 0.9637550115585327, + "learning_rate": 1.858e-05, + "loss": 0.0604, + "step": 930 + }, + { + "epoch": 0.026876340243030736, + "grad_norm": 0.4605652689933777, + "learning_rate": 1.878e-05, + "loss": 0.0657, + "step": 940 + }, + { + "epoch": 0.027162258756254467, + "grad_norm": 1.3346972465515137, + "learning_rate": 1.898e-05, + "loss": 0.0576, + "step": 950 + }, + { + "epoch": 0.027448177269478197, + "grad_norm": 0.8369432091712952, + "learning_rate": 1.918e-05, + "loss": 0.0567, + "step": 960 + }, + { + "epoch": 0.02773409578270193, + "grad_norm": 0.613459050655365, + "learning_rate": 1.938e-05, + "loss": 0.0523, + "step": 970 + }, + { + "epoch": 0.028020014295925662, + "grad_norm": 1.402799367904663, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0794, + "step": 980 + }, + { + "epoch": 0.028305932809149392, + "grad_norm": 1.1603201627731323, + "learning_rate": 1.978e-05, + "loss": 0.0583, + "step": 990 + }, + { + "epoch": 0.028591851322373123, + "grad_norm": 0.8101517558097839, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0584, + "step": 1000 + }, + { + "epoch": 0.028877769835596853, + "grad_norm": 1.060592770576477, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.077, + "step": 1010 + }, + { + "epoch": 0.029163688348820587, + "grad_norm": 1.2096195220947266, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.066, + "step": 1020 + }, + { + "epoch": 0.029449606862044318, + "grad_norm": 1.0035862922668457, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0625, + "step": 1030 + }, + { + "epoch": 0.029735525375268048, + "grad_norm": 0.44185084104537964, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.03002144388849178, + "grad_norm": 1.209908127784729, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0693, + "step": 1050 + }, + { + "epoch": 0.030307362401715512, + "grad_norm": 0.9716938138008118, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0706, + "step": 1060 + }, + { + "epoch": 0.030593280914939243, + "grad_norm": 0.8310994505882263, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0622, + "step": 1070 + }, + { + "epoch": 0.030879199428162973, + "grad_norm": 0.8737888932228088, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0564, + "step": 1080 + }, + { + "epoch": 0.031165117941386704, + "grad_norm": 0.7609763145446777, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0552, + "step": 1090 + }, + { + "epoch": 0.031451036454610434, + "grad_norm": 0.6319764256477356, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0593, + "step": 1100 + }, + { + "epoch": 0.031736954967834165, + "grad_norm": 0.5562251806259155, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0553, + "step": 1110 + }, + { + "epoch": 0.032022873481057895, + "grad_norm": 1.3476046323776245, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0805, + "step": 1120 + }, + { + "epoch": 0.03230879199428163, + "grad_norm": 0.5449394583702087, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0666, + "step": 1130 + }, + { + "epoch": 0.03259471050750536, + "grad_norm": 0.8675817251205444, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0703, + "step": 1140 + }, + { + "epoch": 0.032880629020729094, + "grad_norm": 0.8713150024414062, + "learning_rate": 1.999882759038658e-05, + "loss": 0.063, + "step": 1150 + }, + { + "epoch": 0.033166547533952824, + "grad_norm": 0.7205761075019836, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0597, + "step": 1160 + }, + { + "epoch": 0.033452466047176554, + "grad_norm": 0.482741117477417, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0558, + "step": 1170 + }, + { + "epoch": 0.033738384560400285, + "grad_norm": 0.8652167320251465, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0559, + "step": 1180 + }, + { + "epoch": 0.034024303073624015, + "grad_norm": 0.5286755561828613, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0673, + "step": 1190 + }, + { + "epoch": 0.034310221586847746, + "grad_norm": 0.9883217215538025, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0609, + "step": 1200 + }, + { + "epoch": 0.034596140100071476, + "grad_norm": 0.7700253129005432, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0703, + "step": 1210 + }, + { + "epoch": 0.034882058613295214, + "grad_norm": 0.8669867515563965, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0577, + "step": 1220 + }, + { + "epoch": 0.035167977126518944, + "grad_norm": 0.8856104016304016, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0599, + "step": 1230 + }, + { + "epoch": 0.035453895639742675, + "grad_norm": 0.5517004728317261, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0791, + "step": 1240 + }, + { + "epoch": 0.035739814152966405, + "grad_norm": 0.7505853176116943, + "learning_rate": 1.999672592499692e-05, + "loss": 0.086, + "step": 1250 + }, + { + "epoch": 0.036025732666190136, + "grad_norm": 0.7412230968475342, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0534, + "step": 1260 + }, + { + "epoch": 0.036311651179413866, + "grad_norm": 0.6629419922828674, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0607, + "step": 1270 + }, + { + "epoch": 0.036597569692637597, + "grad_norm": 0.7081887125968933, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0513, + "step": 1280 + }, + { + "epoch": 0.03688348820586133, + "grad_norm": 0.8555129766464233, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0574, + "step": 1290 + }, + { + "epoch": 0.03716940671908506, + "grad_norm": 0.5992563366889954, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0529, + "step": 1300 + }, + { + "epoch": 0.037455325232308795, + "grad_norm": 0.8527185320854187, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0588, + "step": 1310 + }, + { + "epoch": 0.037741243745532525, + "grad_norm": 1.078600525856018, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0644, + "step": 1320 + }, + { + "epoch": 0.038027162258756256, + "grad_norm": 0.8158502578735352, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0587, + "step": 1330 + }, + { + "epoch": 0.038313080771979986, + "grad_norm": 1.011278748512268, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0696, + "step": 1340 + }, + { + "epoch": 0.03859899928520372, + "grad_norm": 0.806888222694397, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0588, + "step": 1350 + }, + { + "epoch": 0.03888491779842745, + "grad_norm": 0.7776031494140625, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0461, + "step": 1360 + }, + { + "epoch": 0.03917083631165118, + "grad_norm": 0.6119349598884583, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0566, + "step": 1370 + }, + { + "epoch": 0.03945675482487491, + "grad_norm": 0.6168059706687927, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0636, + "step": 1380 + }, + { + "epoch": 0.03974267333809864, + "grad_norm": 0.8180692195892334, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0596, + "step": 1390 + }, + { + "epoch": 0.040028591851322376, + "grad_norm": 0.6775726079940796, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0586, + "step": 1400 + }, + { + "epoch": 0.040314510364546106, + "grad_norm": 0.7446377873420715, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.057, + "step": 1410 + }, + { + "epoch": 0.04060042887776984, + "grad_norm": 0.9334514737129211, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0551, + "step": 1420 + }, + { + "epoch": 0.04088634739099357, + "grad_norm": 1.481874942779541, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0664, + "step": 1430 + }, + { + "epoch": 0.0411722659042173, + "grad_norm": 0.9553850889205933, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0645, + "step": 1440 + }, + { + "epoch": 0.04145818441744103, + "grad_norm": 0.8824119567871094, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0694, + "step": 1450 + }, + { + "epoch": 0.04174410293066476, + "grad_norm": 1.0382661819458008, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0683, + "step": 1460 + }, + { + "epoch": 0.04203002144388849, + "grad_norm": 0.5914127826690674, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0652, + "step": 1470 + }, + { + "epoch": 0.04231593995711222, + "grad_norm": 0.8497964143753052, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0706, + "step": 1480 + }, + { + "epoch": 0.04260185847033596, + "grad_norm": 0.897759199142456, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0647, + "step": 1490 + }, + { + "epoch": 0.04288777698355969, + "grad_norm": 1.1102443933486938, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0579, + "step": 1500 + }, + { + "epoch": 0.04317369549678342, + "grad_norm": 0.7638678550720215, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0606, + "step": 1510 + }, + { + "epoch": 0.04345961401000715, + "grad_norm": 0.6662708520889282, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.067, + "step": 1520 + }, + { + "epoch": 0.04374553252323088, + "grad_norm": 0.4957924485206604, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0685, + "step": 1530 + }, + { + "epoch": 0.04403145103645461, + "grad_norm": 0.6456794738769531, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0665, + "step": 1540 + }, + { + "epoch": 0.04431736954967834, + "grad_norm": 1.1598498821258545, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0527, + "step": 1550 + }, + { + "epoch": 0.04460328806290207, + "grad_norm": 0.931520938873291, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0682, + "step": 1560 + }, + { + "epoch": 0.0448892065761258, + "grad_norm": 0.7289925813674927, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0726, + "step": 1570 + }, + { + "epoch": 0.04517512508934954, + "grad_norm": 0.5471235513687134, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0561, + "step": 1580 + }, + { + "epoch": 0.04546104360257327, + "grad_norm": 0.8686550259590149, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0552, + "step": 1590 + }, + { + "epoch": 0.045746962115797, + "grad_norm": 1.1767120361328125, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0544, + "step": 1600 + }, + { + "epoch": 0.04603288062902073, + "grad_norm": 0.8729729056358337, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0625, + "step": 1610 + }, + { + "epoch": 0.04631879914224446, + "grad_norm": 1.3734601736068726, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0667, + "step": 1620 + }, + { + "epoch": 0.04660471765546819, + "grad_norm": 0.6810682415962219, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0522, + "step": 1630 + }, + { + "epoch": 0.04689063616869192, + "grad_norm": 0.7744873762130737, + "learning_rate": 1.997844517262844e-05, + "loss": 0.06, + "step": 1640 + }, + { + "epoch": 0.04717655468191565, + "grad_norm": 1.000954270362854, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0606, + "step": 1650 + }, + { + "epoch": 0.04746247319513938, + "grad_norm": 0.8105701208114624, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.04774839170836312, + "grad_norm": 0.9504240155220032, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0833, + "step": 1670 + }, + { + "epoch": 0.04803431022158685, + "grad_norm": 0.910836935043335, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0582, + "step": 1680 + }, + { + "epoch": 0.04832022873481058, + "grad_norm": 0.5865645408630371, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0657, + "step": 1690 + }, + { + "epoch": 0.04860614724803431, + "grad_norm": 1.0098698139190674, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 0.04889206576125804, + "grad_norm": 0.8097764253616333, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0563, + "step": 1710 + }, + { + "epoch": 0.04917798427448177, + "grad_norm": 0.9958128333091736, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0597, + "step": 1720 + }, + { + "epoch": 0.0494639027877055, + "grad_norm": 0.8471905589103699, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.04974982130092923, + "grad_norm": 0.647058367729187, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 0.05003573981415296, + "grad_norm": 1.0832161903381348, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.052, + "step": 1750 + }, + { + "epoch": 0.0503216583273767, + "grad_norm": 0.8469381332397461, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0615, + "step": 1760 + }, + { + "epoch": 0.05060757684060043, + "grad_norm": 0.5371052622795105, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0556, + "step": 1770 + }, + { + "epoch": 0.05089349535382416, + "grad_norm": 0.9016183614730835, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0561, + "step": 1780 + }, + { + "epoch": 0.05117941386704789, + "grad_norm": 0.8829526305198669, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0648, + "step": 1790 + }, + { + "epoch": 0.05146533238027162, + "grad_norm": 1.079738974571228, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0577, + "step": 1800 + }, + { + "epoch": 0.05175125089349535, + "grad_norm": 0.7496556639671326, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.052, + "step": 1810 + }, + { + "epoch": 0.05203716940671908, + "grad_norm": 0.7587016820907593, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0634, + "step": 1820 + }, + { + "epoch": 0.052323087919942814, + "grad_norm": 0.9622246623039246, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0543, + "step": 1830 + }, + { + "epoch": 0.052609006433166544, + "grad_norm": 0.6643623113632202, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0606, + "step": 1840 + }, + { + "epoch": 0.05289492494639028, + "grad_norm": 0.8060843348503113, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0562, + "step": 1850 + }, + { + "epoch": 0.05318084345961401, + "grad_norm": 0.7353034019470215, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0679, + "step": 1860 + }, + { + "epoch": 0.05346676197283774, + "grad_norm": 0.6636782288551331, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0561, + "step": 1870 + }, + { + "epoch": 0.05375268048606147, + "grad_norm": 0.6760010719299316, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0674, + "step": 1880 + }, + { + "epoch": 0.0540385989992852, + "grad_norm": 0.7144591808319092, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0551, + "step": 1890 + }, + { + "epoch": 0.054324517512508934, + "grad_norm": 0.8346575498580933, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.049, + "step": 1900 + }, + { + "epoch": 0.054610436025732664, + "grad_norm": 1.1682871580123901, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0554, + "step": 1910 + }, + { + "epoch": 0.054896354538956395, + "grad_norm": 0.9150840640068054, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0549, + "step": 1920 + }, + { + "epoch": 0.055182273052180125, + "grad_norm": 0.37064746022224426, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0547, + "step": 1930 + }, + { + "epoch": 0.05546819156540386, + "grad_norm": 1.1214783191680908, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0697, + "step": 1940 + }, + { + "epoch": 0.05575411007862759, + "grad_norm": 0.8259853720664978, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0614, + "step": 1950 + }, + { + "epoch": 0.056040028591851324, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0811, + "step": 1960 + }, + { + "epoch": 0.056325947105075054, + "grad_norm": 0.8764797449111938, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0605, + "step": 1970 + }, + { + "epoch": 0.056611865618298784, + "grad_norm": 0.770044207572937, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0481, + "step": 1980 + }, + { + "epoch": 0.056897784131522515, + "grad_norm": 1.333876132965088, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0729, + "step": 1990 + }, + { + "epoch": 0.057183702644746245, + "grad_norm": 0.5231258273124695, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.051, + "step": 2000 + }, + { + "epoch": 0.057469621157969976, + "grad_norm": 1.1937541961669922, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.061, + "step": 2010 + }, + { + "epoch": 0.057755539671193706, + "grad_norm": 0.7843487858772278, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0688, + "step": 2020 + }, + { + "epoch": 0.058041458184417444, + "grad_norm": 0.7956593632698059, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0878, + "step": 2030 + }, + { + "epoch": 0.058327376697641174, + "grad_norm": 0.5006444454193115, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0588, + "step": 2040 + }, + { + "epoch": 0.058613295210864905, + "grad_norm": 1.162245750427246, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0619, + "step": 2050 + }, + { + "epoch": 0.058899213724088635, + "grad_norm": 0.46943384408950806, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0584, + "step": 2060 + }, + { + "epoch": 0.059185132237312366, + "grad_norm": 0.3780323266983032, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0462, + "step": 2070 + }, + { + "epoch": 0.059471050750536096, + "grad_norm": 0.7066171765327454, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0652, + "step": 2080 + }, + { + "epoch": 0.05975696926375983, + "grad_norm": 0.8464685082435608, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0546, + "step": 2090 + }, + { + "epoch": 0.06004288777698356, + "grad_norm": 0.7198944687843323, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0534, + "step": 2100 + }, + { + "epoch": 0.06032880629020729, + "grad_norm": 0.7136557698249817, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0665, + "step": 2110 + }, + { + "epoch": 0.060614724803431025, + "grad_norm": 0.8739225268363953, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0542, + "step": 2120 + }, + { + "epoch": 0.060900643316654755, + "grad_norm": 0.6694063544273376, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0575, + "step": 2130 + }, + { + "epoch": 0.061186561829878486, + "grad_norm": 0.4805296063423157, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0578, + "step": 2140 + }, + { + "epoch": 0.061472480343102216, + "grad_norm": 0.758660078048706, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0455, + "step": 2150 + }, + { + "epoch": 0.06175839885632595, + "grad_norm": 0.8114968538284302, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0801, + "step": 2160 + }, + { + "epoch": 0.06204431736954968, + "grad_norm": 0.6585670113563538, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0564, + "step": 2170 + }, + { + "epoch": 0.06233023588277341, + "grad_norm": 1.2986794710159302, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0595, + "step": 2180 + }, + { + "epoch": 0.06261615439599715, + "grad_norm": 0.9822471141815186, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0515, + "step": 2190 + }, + { + "epoch": 0.06290207290922087, + "grad_norm": 0.8112025260925293, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0585, + "step": 2200 + }, + { + "epoch": 0.0631879914224446, + "grad_norm": 0.6239551305770874, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0641, + "step": 2210 + }, + { + "epoch": 0.06347390993566833, + "grad_norm": 0.8405657410621643, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.057, + "step": 2220 + }, + { + "epoch": 0.06375982844889207, + "grad_norm": 0.4925670623779297, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0568, + "step": 2230 + }, + { + "epoch": 0.06404574696211579, + "grad_norm": 0.8599978089332581, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0587, + "step": 2240 + }, + { + "epoch": 0.06433166547533953, + "grad_norm": 0.8657258749008179, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.06461758398856327, + "grad_norm": 0.5826218128204346, + "learning_rate": 1.991642153373178e-05, + "loss": 0.055, + "step": 2260 + }, + { + "epoch": 0.06490350250178699, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0533, + "step": 2270 + }, + { + "epoch": 0.06518942101501073, + "grad_norm": 0.8345134258270264, + "learning_rate": 1.991374933341515e-05, + "loss": 0.064, + "step": 2280 + }, + { + "epoch": 0.06547533952823445, + "grad_norm": 0.6610177755355835, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0553, + "step": 2290 + }, + { + "epoch": 0.06576125804145819, + "grad_norm": 0.8541404604911804, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0483, + "step": 2300 + }, + { + "epoch": 0.06604717655468191, + "grad_norm": 0.9029123187065125, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0517, + "step": 2310 + }, + { + "epoch": 0.06633309506790565, + "grad_norm": 0.614111602306366, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.06661901358112937, + "grad_norm": 0.8723806142807007, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0511, + "step": 2330 + }, + { + "epoch": 0.06690493209435311, + "grad_norm": 0.5288586020469666, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0474, + "step": 2340 + }, + { + "epoch": 0.06719085060757685, + "grad_norm": 0.6346511840820312, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0532, + "step": 2350 + }, + { + "epoch": 0.06747676912080057, + "grad_norm": 0.9112687706947327, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0583, + "step": 2360 + }, + { + "epoch": 0.06776268763402431, + "grad_norm": 0.6879385113716125, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0551, + "step": 2370 + }, + { + "epoch": 0.06804860614724803, + "grad_norm": 0.6945562958717346, + "learning_rate": 1.989976094288735e-05, + "loss": 0.053, + "step": 2380 + }, + { + "epoch": 0.06833452466047177, + "grad_norm": 0.6774301528930664, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0596, + "step": 2390 + }, + { + "epoch": 0.06862044317369549, + "grad_norm": 0.7311446070671082, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0576, + "step": 2400 + }, + { + "epoch": 0.06890636168691923, + "grad_norm": 0.9301936030387878, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0608, + "step": 2410 + }, + { + "epoch": 0.06919228020014295, + "grad_norm": 1.1750341653823853, + "learning_rate": 1.989387305123247e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.06947819871336669, + "grad_norm": 0.716266930103302, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.053, + "step": 2430 + }, + { + "epoch": 0.06976411722659043, + "grad_norm": 0.8549973964691162, + "learning_rate": 1.989086647373215e-05, + "loss": 0.061, + "step": 2440 + }, + { + "epoch": 0.07005003573981415, + "grad_norm": 0.7306638360023499, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "epoch": 0.07033595425303789, + "grad_norm": 1.2529624700546265, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0597, + "step": 2460 + }, + { + "epoch": 0.07062187276626161, + "grad_norm": 0.7199717164039612, + "learning_rate": 1.988627835751598e-05, + "loss": 0.047, + "step": 2470 + }, + { + "epoch": 0.07090779127948535, + "grad_norm": 0.8007253408432007, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0544, + "step": 2480 + }, + { + "epoch": 0.07119370979270907, + "grad_norm": 0.7852535843849182, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0507, + "step": 2490 + }, + { + "epoch": 0.07147962830593281, + "grad_norm": 1.0649739503860474, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.058, + "step": 2500 + }, + { + "epoch": 0.07176554681915653, + "grad_norm": 0.8080071806907654, + "learning_rate": 1.988001487826387e-05, + "loss": 0.059, + "step": 2510 + }, + { + "epoch": 0.07205146533238027, + "grad_norm": 0.49453601241111755, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0522, + "step": 2520 + }, + { + "epoch": 0.07233738384560401, + "grad_norm": 0.7618975639343262, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.07262330235882773, + "grad_norm": 0.6284596920013428, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.058, + "step": 2540 + }, + { + "epoch": 0.07290922087205147, + "grad_norm": 1.6536812782287598, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0535, + "step": 2550 + }, + { + "epoch": 0.07319513938527519, + "grad_norm": 0.6516987681388855, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.061, + "step": 2560 + }, + { + "epoch": 0.07348105789849893, + "grad_norm": 0.7660441398620605, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0603, + "step": 2570 + }, + { + "epoch": 0.07376697641172265, + "grad_norm": 0.7900884747505188, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0494, + "step": 2580 + }, + { + "epoch": 0.07405289492494639, + "grad_norm": 0.9578459858894348, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0492, + "step": 2590 + }, + { + "epoch": 0.07433881343817011, + "grad_norm": 0.5268751978874207, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0524, + "step": 2600 + }, + { + "epoch": 0.07462473195139385, + "grad_norm": 0.8935990929603577, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0537, + "step": 2610 + }, + { + "epoch": 0.07491065046461759, + "grad_norm": 0.940441370010376, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0545, + "step": 2620 + }, + { + "epoch": 0.07519656897784131, + "grad_norm": 0.42767468094825745, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0508, + "step": 2630 + }, + { + "epoch": 0.07548248749106505, + "grad_norm": 0.6892207860946655, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0417, + "step": 2640 + }, + { + "epoch": 0.07576840600428877, + "grad_norm": 1.2622859477996826, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0665, + "step": 2650 + }, + { + "epoch": 0.07605432451751251, + "grad_norm": 0.8809115290641785, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0617, + "step": 2660 + }, + { + "epoch": 0.07634024303073624, + "grad_norm": 0.604371190071106, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0577, + "step": 2670 + }, + { + "epoch": 0.07662616154395997, + "grad_norm": 0.7091525793075562, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0552, + "step": 2680 + }, + { + "epoch": 0.0769120800571837, + "grad_norm": 0.7841326594352722, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0579, + "step": 2690 + }, + { + "epoch": 0.07719799857040743, + "grad_norm": 0.7789046764373779, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0511, + "step": 2700 + }, + { + "epoch": 0.07748391708363117, + "grad_norm": 0.6497660875320435, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0532, + "step": 2710 + }, + { + "epoch": 0.0777698355968549, + "grad_norm": 0.6902356147766113, + "learning_rate": 1.984439891859038e-05, + "loss": 0.06, + "step": 2720 + }, + { + "epoch": 0.07805575411007863, + "grad_norm": 0.5721703767776489, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0442, + "step": 2730 + }, + { + "epoch": 0.07834167262330236, + "grad_norm": 0.5205336809158325, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0551, + "step": 2740 + }, + { + "epoch": 0.07862759113652609, + "grad_norm": 1.0646073818206787, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0546, + "step": 2750 + }, + { + "epoch": 0.07891350964974982, + "grad_norm": 0.6809906363487244, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0603, + "step": 2760 + }, + { + "epoch": 0.07919942816297355, + "grad_norm": 0.7592756152153015, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0611, + "step": 2770 + }, + { + "epoch": 0.07948534667619728, + "grad_norm": 0.970733106136322, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.066, + "step": 2780 + }, + { + "epoch": 0.07977126518942101, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.983150881656814e-05, + "loss": 0.049, + "step": 2790 + }, + { + "epoch": 0.08005718370264475, + "grad_norm": 0.6761397123336792, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.048, + "step": 2800 + }, + { + "epoch": 0.08034310221586848, + "grad_norm": 0.9752228856086731, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.08062902072909221, + "grad_norm": 0.8727124929428101, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0629, + "step": 2820 + }, + { + "epoch": 0.08091493924231594, + "grad_norm": 0.8425240516662598, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0517, + "step": 2830 + }, + { + "epoch": 0.08120085775553967, + "grad_norm": 0.7011470198631287, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0484, + "step": 2840 + }, + { + "epoch": 0.0814867762687634, + "grad_norm": 0.836200475692749, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0504, + "step": 2850 + }, + { + "epoch": 0.08177269478198713, + "grad_norm": 0.4431964159011841, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0527, + "step": 2860 + }, + { + "epoch": 0.08205861329521086, + "grad_norm": 0.4666791260242462, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0556, + "step": 2870 + }, + { + "epoch": 0.0823445318084346, + "grad_norm": 0.5705346465110779, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0544, + "step": 2880 + }, + { + "epoch": 0.08263045032165833, + "grad_norm": 1.7237486839294434, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0555, + "step": 2890 + }, + { + "epoch": 0.08291636883488206, + "grad_norm": 0.9305147528648376, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.055, + "step": 2900 + }, + { + "epoch": 0.0832022873481058, + "grad_norm": 1.3475992679595947, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0723, + "step": 2910 + }, + { + "epoch": 0.08348820586132952, + "grad_norm": 0.7196787595748901, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0581, + "step": 2920 + }, + { + "epoch": 0.08377412437455325, + "grad_norm": 0.4567016363143921, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.08406004288777698, + "grad_norm": 0.8537796139717102, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0589, + "step": 2940 + }, + { + "epoch": 0.08434596140100072, + "grad_norm": 0.9526864886283875, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0581, + "step": 2950 + }, + { + "epoch": 0.08463187991422444, + "grad_norm": 0.8753517866134644, + "learning_rate": 1.979809151602651e-05, + "loss": 0.066, + "step": 2960 + }, + { + "epoch": 0.08491779842744818, + "grad_norm": 0.9062561988830566, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0472, + "step": 2970 + }, + { + "epoch": 0.08520371694067191, + "grad_norm": 1.0018329620361328, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0584, + "step": 2980 + }, + { + "epoch": 0.08548963545389564, + "grad_norm": 1.0577157735824585, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.05, + "step": 2990 + }, + { + "epoch": 0.08577555396711938, + "grad_norm": 1.0216799974441528, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0703, + "step": 3000 + }, + { + "epoch": 0.0860614724803431, + "grad_norm": 0.5581191778182983, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0682, + "step": 3010 + }, + { + "epoch": 0.08634739099356684, + "grad_norm": 0.6187682151794434, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 0.08663330950679056, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0565, + "step": 3030 + }, + { + "epoch": 0.0869192280200143, + "grad_norm": 0.8952509760856628, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0615, + "step": 3040 + }, + { + "epoch": 0.08720514653323802, + "grad_norm": 0.7387855648994446, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0434, + "step": 3050 + }, + { + "epoch": 0.08749106504646176, + "grad_norm": 0.8661363124847412, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0453, + "step": 3060 + }, + { + "epoch": 0.0877769835596855, + "grad_norm": 1.552089810371399, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0577, + "step": 3070 + }, + { + "epoch": 0.08806290207290922, + "grad_norm": 0.7555598616600037, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.058, + "step": 3080 + }, + { + "epoch": 0.08834882058613296, + "grad_norm": 0.7763100266456604, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.074, + "step": 3090 + }, + { + "epoch": 0.08863473909935668, + "grad_norm": 0.5088932514190674, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.07, + "step": 3100 + }, + { + "epoch": 0.08892065761258042, + "grad_norm": 0.517383873462677, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.08920657612580414, + "grad_norm": 0.9673930406570435, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.061, + "step": 3120 + }, + { + "epoch": 0.08949249463902788, + "grad_norm": 1.1182832717895508, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0503, + "step": 3130 + }, + { + "epoch": 0.0897784131522516, + "grad_norm": 0.8064592480659485, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0527, + "step": 3140 + }, + { + "epoch": 0.09006433166547534, + "grad_norm": 1.3616310358047485, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0491, + "step": 3150 + }, + { + "epoch": 0.09035025017869908, + "grad_norm": 0.6205968856811523, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0492, + "step": 3160 + }, + { + "epoch": 0.0906361686919228, + "grad_norm": 0.9427729249000549, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.054, + "step": 3170 + }, + { + "epoch": 0.09092208720514654, + "grad_norm": 0.6940050721168518, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0622, + "step": 3180 + }, + { + "epoch": 0.09120800571837026, + "grad_norm": 0.7082361578941345, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0474, + "step": 3190 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.4606474041938782, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 0.09177984274481772, + "grad_norm": 0.46445760130882263, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0483, + "step": 3210 + }, + { + "epoch": 0.09206576125804146, + "grad_norm": 0.7431371212005615, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.046, + "step": 3220 + }, + { + "epoch": 0.09235167977126518, + "grad_norm": 0.8430010676383972, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0665, + "step": 3230 + }, + { + "epoch": 0.09263759828448892, + "grad_norm": 0.9888875484466553, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0676, + "step": 3240 + }, + { + "epoch": 0.09292351679771266, + "grad_norm": 0.792150616645813, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0636, + "step": 3250 + }, + { + "epoch": 0.09320943531093638, + "grad_norm": 0.859030544757843, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0634, + "step": 3260 + }, + { + "epoch": 0.09349535382416012, + "grad_norm": 0.7612795233726501, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0658, + "step": 3270 + }, + { + "epoch": 0.09378127233738384, + "grad_norm": 0.5470104217529297, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0514, + "step": 3280 + }, + { + "epoch": 0.09406719085060758, + "grad_norm": 0.6354894042015076, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0489, + "step": 3290 + }, + { + "epoch": 0.0943531093638313, + "grad_norm": 1.3852356672286987, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 0.09463902787705504, + "grad_norm": 0.5610274076461792, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.09492494639027876, + "grad_norm": 1.2192410230636597, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0525, + "step": 3320 + }, + { + "epoch": 0.0952108649035025, + "grad_norm": 1.06831955909729, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.051, + "step": 3330 + }, + { + "epoch": 0.09549678341672624, + "grad_norm": 0.32288479804992676, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0503, + "step": 3340 + }, + { + "epoch": 0.09578270192994996, + "grad_norm": 0.5871645212173462, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0564, + "step": 3350 + }, + { + "epoch": 0.0960686204431737, + "grad_norm": 0.6069591045379639, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0495, + "step": 3360 + }, + { + "epoch": 0.09635453895639742, + "grad_norm": 1.0015379190444946, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0624, + "step": 3370 + }, + { + "epoch": 0.09664045746962116, + "grad_norm": 0.7534980773925781, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0618, + "step": 3380 + }, + { + "epoch": 0.09692637598284488, + "grad_norm": 0.45888280868530273, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0445, + "step": 3390 + }, + { + "epoch": 0.09721229449606862, + "grad_norm": 0.7550806403160095, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0461, + "step": 3400 + }, + { + "epoch": 0.09749821300929234, + "grad_norm": 0.4738181531429291, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 0.09778413152251608, + "grad_norm": 0.6711190938949585, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0476, + "step": 3420 + }, + { + "epoch": 0.09807005003573982, + "grad_norm": 0.4751316010951996, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0507, + "step": 3430 + }, + { + "epoch": 0.09835596854896354, + "grad_norm": 0.83565753698349, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "epoch": 0.09864188706218728, + "grad_norm": 0.5360665321350098, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0627, + "step": 3450 + }, + { + "epoch": 0.098927805575411, + "grad_norm": 0.7463604211807251, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0496, + "step": 3460 + }, + { + "epoch": 0.09921372408863474, + "grad_norm": 0.7294344305992126, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0697, + "step": 3470 + }, + { + "epoch": 0.09949964260185847, + "grad_norm": 0.5676283836364746, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0541, + "step": 3480 + }, + { + "epoch": 0.0997855611150822, + "grad_norm": 0.5879732370376587, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 0.10007147962830593, + "grad_norm": 0.832818865776062, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0505, + "step": 3500 + }, + { + "epoch": 0.10035739814152966, + "grad_norm": 0.48553410172462463, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0573, + "step": 3510 + }, + { + "epoch": 0.1006433166547534, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0449, + "step": 3520 + }, + { + "epoch": 0.10092923516797712, + "grad_norm": 0.7497885227203369, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0737, + "step": 3530 + }, + { + "epoch": 0.10121515368120086, + "grad_norm": 0.5581928491592407, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0514, + "step": 3540 + }, + { + "epoch": 0.10150107219442459, + "grad_norm": 1.140236258506775, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0524, + "step": 3550 + }, + { + "epoch": 0.10178699070764832, + "grad_norm": 0.8161870241165161, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0721, + "step": 3560 + }, + { + "epoch": 0.10207290922087205, + "grad_norm": 0.8796533942222595, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0564, + "step": 3570 + }, + { + "epoch": 0.10235882773409578, + "grad_norm": 1.4811128377914429, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.10264474624731951, + "grad_norm": 0.8029062747955322, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0589, + "step": 3590 + }, + { + "epoch": 0.10293066476054324, + "grad_norm": 0.7806634902954102, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0617, + "step": 3600 + }, + { + "epoch": 0.10321658327376698, + "grad_norm": 1.1286838054656982, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0574, + "step": 3610 + }, + { + "epoch": 0.1035025017869907, + "grad_norm": 0.374104768037796, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.047, + "step": 3620 + }, + { + "epoch": 0.10378842030021444, + "grad_norm": 1.1743136644363403, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0676, + "step": 3630 + }, + { + "epoch": 0.10407433881343817, + "grad_norm": 0.7684413194656372, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0663, + "step": 3640 + }, + { + "epoch": 0.1043602573266619, + "grad_norm": 1.0642409324645996, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.051, + "step": 3650 + }, + { + "epoch": 0.10464617583988563, + "grad_norm": 0.7752460837364197, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0543, + "step": 3660 + }, + { + "epoch": 0.10493209435310936, + "grad_norm": 0.9053257703781128, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.051, + "step": 3670 + }, + { + "epoch": 0.10521801286633309, + "grad_norm": 0.7407983541488647, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 0.10550393137955683, + "grad_norm": 1.3622519969940186, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0468, + "step": 3690 + }, + { + "epoch": 0.10578984989278056, + "grad_norm": 1.2751463651657104, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0571, + "step": 3700 + }, + { + "epoch": 0.10607576840600429, + "grad_norm": 0.5535411238670349, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0564, + "step": 3710 + }, + { + "epoch": 0.10636168691922802, + "grad_norm": 0.6728671193122864, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0487, + "step": 3720 + }, + { + "epoch": 0.10664760543245175, + "grad_norm": 0.82345050573349, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0656, + "step": 3730 + }, + { + "epoch": 0.10693352394567548, + "grad_norm": 0.6446594595909119, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0467, + "step": 3740 + }, + { + "epoch": 0.10721944245889921, + "grad_norm": 1.0836280584335327, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0536, + "step": 3750 + }, + { + "epoch": 0.10750536097212295, + "grad_norm": 0.3758300840854645, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0476, + "step": 3760 + }, + { + "epoch": 0.10779127948534667, + "grad_norm": 0.682266116142273, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0519, + "step": 3770 + }, + { + "epoch": 0.1080771979985704, + "grad_norm": 0.5025804042816162, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0474, + "step": 3780 + }, + { + "epoch": 0.10836311651179414, + "grad_norm": 1.019890308380127, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0492, + "step": 3790 + }, + { + "epoch": 0.10864903502501787, + "grad_norm": 0.7843710780143738, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0561, + "step": 3800 + }, + { + "epoch": 0.1089349535382416, + "grad_norm": 0.5028522610664368, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0582, + "step": 3810 + }, + { + "epoch": 0.10922087205146533, + "grad_norm": 0.6400144696235657, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0502, + "step": 3820 + }, + { + "epoch": 0.10950679056468907, + "grad_norm": 0.9432899355888367, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0471, + "step": 3830 + }, + { + "epoch": 0.10979270907791279, + "grad_norm": 0.7582482695579529, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.052, + "step": 3840 + }, + { + "epoch": 0.11007862759113653, + "grad_norm": 0.34035608172416687, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0464, + "step": 3850 + }, + { + "epoch": 0.11036454610436025, + "grad_norm": 1.3330878019332886, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.11065046461758399, + "grad_norm": 0.7309219837188721, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.052, + "step": 3870 + }, + { + "epoch": 0.11093638313080773, + "grad_norm": 0.6248922944068909, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0815, + "step": 3880 + }, + { + "epoch": 0.11122230164403145, + "grad_norm": 0.8298835158348083, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0491, + "step": 3890 + }, + { + "epoch": 0.11150822015725519, + "grad_norm": 0.6728928685188293, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0506, + "step": 3900 + }, + { + "epoch": 0.11179413867047891, + "grad_norm": 0.8456764817237854, + "learning_rate": 1.95567930185928e-05, + "loss": 0.051, + "step": 3910 + }, + { + "epoch": 0.11208005718370265, + "grad_norm": 0.9024212956428528, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0514, + "step": 3920 + }, + { + "epoch": 0.11236597569692637, + "grad_norm": 0.4843275845050812, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.056, + "step": 3930 + }, + { + "epoch": 0.11265189421015011, + "grad_norm": 0.5677530765533447, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0548, + "step": 3940 + }, + { + "epoch": 0.11293781272337383, + "grad_norm": 1.0913296937942505, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0697, + "step": 3950 + }, + { + "epoch": 0.11322373123659757, + "grad_norm": 0.6271129250526428, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0655, + "step": 3960 + }, + { + "epoch": 0.1135096497498213, + "grad_norm": 0.9063813090324402, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0469, + "step": 3970 + }, + { + "epoch": 0.11379556826304503, + "grad_norm": 0.7493836283683777, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0482, + "step": 3980 + }, + { + "epoch": 0.11408148677626877, + "grad_norm": 0.8022870421409607, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0655, + "step": 3990 + }, + { + "epoch": 0.11436740528949249, + "grad_norm": 0.6266750693321228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0542, + "step": 4000 + }, + { + "epoch": 0.11465332380271623, + "grad_norm": 0.45027732849121094, + "learning_rate": 1.95260726824789e-05, + "loss": 0.058, + "step": 4010 + }, + { + "epoch": 0.11493924231593995, + "grad_norm": 0.950760543346405, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0552, + "step": 4020 + }, + { + "epoch": 0.11522516082916369, + "grad_norm": 0.6397078037261963, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0544, + "step": 4030 + }, + { + "epoch": 0.11551107934238741, + "grad_norm": 0.7060579657554626, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0565, + "step": 4040 + }, + { + "epoch": 0.11579699785561115, + "grad_norm": 0.7861781716346741, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0511, + "step": 4050 + }, + { + "epoch": 0.11608291636883489, + "grad_norm": 0.5479229688644409, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0509, + "step": 4060 + }, + { + "epoch": 0.11636883488205861, + "grad_norm": 0.3854960501194, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0623, + "step": 4070 + }, + { + "epoch": 0.11665475339528235, + "grad_norm": 1.9533435106277466, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0643, + "step": 4080 + }, + { + "epoch": 0.11694067190850607, + "grad_norm": 0.5853668451309204, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0453, + "step": 4090 + }, + { + "epoch": 0.11722659042172981, + "grad_norm": 0.6850668787956238, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0555, + "step": 4100 + }, + { + "epoch": 0.11751250893495353, + "grad_norm": 1.1605839729309082, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0586, + "step": 4110 + }, + { + "epoch": 0.11779842744817727, + "grad_norm": 0.7753151059150696, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0623, + "step": 4120 + }, + { + "epoch": 0.118084345961401, + "grad_norm": 0.7955726385116577, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0467, + "step": 4130 + }, + { + "epoch": 0.11837026447462473, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0523, + "step": 4140 + }, + { + "epoch": 0.11865618298784847, + "grad_norm": 0.5821241140365601, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 0.11894210150107219, + "grad_norm": 0.4795539379119873, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0559, + "step": 4160 + }, + { + "epoch": 0.11922802001429593, + "grad_norm": 0.6324377655982971, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0473, + "step": 4170 + }, + { + "epoch": 0.11951393852751965, + "grad_norm": 0.8578745722770691, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0474, + "step": 4180 + }, + { + "epoch": 0.11979985704074339, + "grad_norm": 0.5988736748695374, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 0.12008577555396711, + "grad_norm": 0.8098701238632202, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0511, + "step": 4200 + }, + { + "epoch": 0.12037169406719085, + "grad_norm": 1.2059956789016724, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0501, + "step": 4210 + }, + { + "epoch": 0.12065761258041457, + "grad_norm": 0.7477571368217468, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0565, + "step": 4220 + }, + { + "epoch": 0.12094353109363831, + "grad_norm": 0.467942476272583, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0601, + "step": 4230 + }, + { + "epoch": 0.12122944960686205, + "grad_norm": 0.5761682391166687, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.052, + "step": 4240 + }, + { + "epoch": 0.12151536812008577, + "grad_norm": 0.8247032761573792, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0503, + "step": 4250 + }, + { + "epoch": 0.12180128663330951, + "grad_norm": 0.5218040347099304, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0434, + "step": 4260 + }, + { + "epoch": 0.12208720514653323, + "grad_norm": 0.5024936199188232, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 0.12237312365975697, + "grad_norm": 0.5558021664619446, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0493, + "step": 4280 + }, + { + "epoch": 0.1226590421729807, + "grad_norm": 0.6252139210700989, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0467, + "step": 4290 + }, + { + "epoch": 0.12294496068620443, + "grad_norm": 0.6613588929176331, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0572, + "step": 4300 + }, + { + "epoch": 0.12323087919942816, + "grad_norm": 0.8098927736282349, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0551, + "step": 4310 + }, + { + "epoch": 0.1235167977126519, + "grad_norm": 0.8598331809043884, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0517, + "step": 4320 + }, + { + "epoch": 0.12380271622587563, + "grad_norm": 1.2555822134017944, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0499, + "step": 4330 + }, + { + "epoch": 0.12408863473909935, + "grad_norm": 0.5311633348464966, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0467, + "step": 4340 + }, + { + "epoch": 0.12437455325232309, + "grad_norm": 0.5674521327018738, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0564, + "step": 4350 + }, + { + "epoch": 0.12466047176554682, + "grad_norm": 0.5226582884788513, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0503, + "step": 4360 + }, + { + "epoch": 0.12494639027877055, + "grad_norm": 0.8510275483131409, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0626, + "step": 4370 + }, + { + "epoch": 0.1252323087919943, + "grad_norm": 1.6184005737304688, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0565, + "step": 4380 + }, + { + "epoch": 0.125518227305218, + "grad_norm": 0.7836401462554932, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0567, + "step": 4390 + }, + { + "epoch": 0.12580414581844174, + "grad_norm": 0.686989963054657, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0727, + "step": 4400 + }, + { + "epoch": 0.12609006433166547, + "grad_norm": 0.6000984907150269, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0459, + "step": 4410 + }, + { + "epoch": 0.1263759828448892, + "grad_norm": 0.8751336932182312, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0509, + "step": 4420 + }, + { + "epoch": 0.12666190135811295, + "grad_norm": 0.9281551837921143, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0536, + "step": 4430 + }, + { + "epoch": 0.12694781987133666, + "grad_norm": 0.5268979668617249, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 0.1272337383845604, + "grad_norm": 0.9246962070465088, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0525, + "step": 4450 + }, + { + "epoch": 0.12751965689778413, + "grad_norm": 1.2159569263458252, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0559, + "step": 4460 + }, + { + "epoch": 0.12780557541100787, + "grad_norm": 1.1705470085144043, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0473, + "step": 4470 + }, + { + "epoch": 0.12809149392423158, + "grad_norm": 0.4624033570289612, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0385, + "step": 4480 + }, + { + "epoch": 0.12837741243745532, + "grad_norm": 0.68497633934021, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.055, + "step": 4490 + }, + { + "epoch": 0.12866333095067906, + "grad_norm": 0.6132450699806213, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0512, + "step": 4500 + }, + { + "epoch": 0.1289492494639028, + "grad_norm": 0.7438398003578186, + "learning_rate": 1.935753861926916e-05, + "loss": 0.057, + "step": 4510 + }, + { + "epoch": 0.12923516797712653, + "grad_norm": 1.01064133644104, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0542, + "step": 4520 + }, + { + "epoch": 0.12952108649035024, + "grad_norm": 0.7620115280151367, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0511, + "step": 4530 + }, + { + "epoch": 0.12980700500357398, + "grad_norm": 0.8325042128562927, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0434, + "step": 4540 + }, + { + "epoch": 0.13009292351679771, + "grad_norm": 1.333525538444519, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0527, + "step": 4550 + }, + { + "epoch": 0.13037884203002145, + "grad_norm": 0.5498093962669373, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0455, + "step": 4560 + }, + { + "epoch": 0.13066476054324516, + "grad_norm": 0.8072503209114075, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0471, + "step": 4570 + }, + { + "epoch": 0.1309506790564689, + "grad_norm": 0.7596970200538635, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0476, + "step": 4580 + }, + { + "epoch": 0.13123659756969264, + "grad_norm": 0.5895066857337952, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.058, + "step": 4590 + }, + { + "epoch": 0.13152251608291637, + "grad_norm": 0.7977209687232971, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0453, + "step": 4600 + }, + { + "epoch": 0.1318084345961401, + "grad_norm": 0.6070771813392639, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0626, + "step": 4610 + }, + { + "epoch": 0.13209435310936382, + "grad_norm": 0.776318371295929, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0502, + "step": 4620 + }, + { + "epoch": 0.13238027162258756, + "grad_norm": 0.7913787961006165, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0495, + "step": 4630 + }, + { + "epoch": 0.1326661901358113, + "grad_norm": 0.7327920794487, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0537, + "step": 4640 + }, + { + "epoch": 0.13295210864903503, + "grad_norm": 1.2004122734069824, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0479, + "step": 4650 + }, + { + "epoch": 0.13323802716225874, + "grad_norm": 0.663301408290863, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0426, + "step": 4660 + }, + { + "epoch": 0.13352394567548248, + "grad_norm": 0.7744486331939697, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0538, + "step": 4670 + }, + { + "epoch": 0.13380986418870622, + "grad_norm": 0.6179795265197754, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.13409578270192996, + "grad_norm": 0.6461634635925293, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0458, + "step": 4690 + }, + { + "epoch": 0.1343817012151537, + "grad_norm": 0.6578474640846252, + "learning_rate": 1.928703895604588e-05, + "loss": 0.064, + "step": 4700 + }, + { + "epoch": 0.1346676197283774, + "grad_norm": 0.8851020336151123, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0632, + "step": 4710 + }, + { + "epoch": 0.13495353824160114, + "grad_norm": 0.4704781472682953, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0431, + "step": 4720 + }, + { + "epoch": 0.13523945675482488, + "grad_norm": 0.9809741377830505, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.059, + "step": 4730 + }, + { + "epoch": 0.13552537526804861, + "grad_norm": 0.9307458400726318, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0528, + "step": 4740 + }, + { + "epoch": 0.13581129378127232, + "grad_norm": 0.8084405660629272, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0603, + "step": 4750 + }, + { + "epoch": 0.13609721229449606, + "grad_norm": 0.6919799447059631, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0589, + "step": 4760 + }, + { + "epoch": 0.1363831308077198, + "grad_norm": 0.8543849587440491, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0508, + "step": 4770 + }, + { + "epoch": 0.13666904932094354, + "grad_norm": 0.6308473348617554, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0485, + "step": 4780 + }, + { + "epoch": 0.13695496783416727, + "grad_norm": 0.739931046962738, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0534, + "step": 4790 + }, + { + "epoch": 0.13724088634739098, + "grad_norm": 0.7895604372024536, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0449, + "step": 4800 + }, + { + "epoch": 0.13752680486061472, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0496, + "step": 4810 + }, + { + "epoch": 0.13781272337383846, + "grad_norm": 0.5999978184700012, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.1380986418870622, + "grad_norm": 0.8037213087081909, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0667, + "step": 4830 + }, + { + "epoch": 0.1383845604002859, + "grad_norm": 0.7414689064025879, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0509, + "step": 4840 + }, + { + "epoch": 0.13867047891350964, + "grad_norm": 0.6627739667892456, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0515, + "step": 4850 + }, + { + "epoch": 0.13895639742673338, + "grad_norm": 0.6969587802886963, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0626, + "step": 4860 + }, + { + "epoch": 0.13924231593995712, + "grad_norm": 0.7554855942726135, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0478, + "step": 4870 + }, + { + "epoch": 0.13952823445318085, + "grad_norm": 0.5623564124107361, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.044, + "step": 4880 + }, + { + "epoch": 0.13981415296640456, + "grad_norm": 0.6897832751274109, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0466, + "step": 4890 + }, + { + "epoch": 0.1401000714796283, + "grad_norm": 0.5474520921707153, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0514, + "step": 4900 + }, + { + "epoch": 0.14038598999285204, + "grad_norm": 0.9736361503601074, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0472, + "step": 4910 + }, + { + "epoch": 0.14067190850607578, + "grad_norm": 0.5566041469573975, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0533, + "step": 4920 + }, + { + "epoch": 0.1409578270192995, + "grad_norm": 1.0295166969299316, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0478, + "step": 4930 + }, + { + "epoch": 0.14124374553252322, + "grad_norm": 1.0931389331817627, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0652, + "step": 4940 + }, + { + "epoch": 0.14152966404574696, + "grad_norm": 1.3054399490356445, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0564, + "step": 4950 + }, + { + "epoch": 0.1418155825589707, + "grad_norm": 0.45592883229255676, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0456, + "step": 4960 + }, + { + "epoch": 0.14210150107219444, + "grad_norm": 0.6758268475532532, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0543, + "step": 4970 + }, + { + "epoch": 0.14238741958541815, + "grad_norm": 0.9643615484237671, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 0.14267333809864188, + "grad_norm": 0.565969705581665, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0458, + "step": 4990 + }, + { + "epoch": 0.14295925661186562, + "grad_norm": 0.8053064346313477, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0558, + "step": 5000 + }, + { + "epoch": 0.14324517512508936, + "grad_norm": 0.606215238571167, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0562, + "step": 5010 + }, + { + "epoch": 0.14353109363831307, + "grad_norm": 0.5565656423568726, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0543, + "step": 5020 + }, + { + "epoch": 0.1438170121515368, + "grad_norm": 0.353696346282959, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0451, + "step": 5030 + }, + { + "epoch": 0.14410293066476054, + "grad_norm": 0.6627641916275024, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0607, + "step": 5040 + }, + { + "epoch": 0.14438884917798428, + "grad_norm": 0.7896742224693298, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0496, + "step": 5050 + }, + { + "epoch": 0.14467476769120802, + "grad_norm": 0.7444631457328796, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0641, + "step": 5060 + }, + { + "epoch": 0.14496068620443173, + "grad_norm": 0.7871376872062683, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 0.14524660471765546, + "grad_norm": 0.7784642577171326, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0466, + "step": 5080 + }, + { + "epoch": 0.1455325232308792, + "grad_norm": 0.6950685381889343, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0457, + "step": 5090 + }, + { + "epoch": 0.14581844174410294, + "grad_norm": 1.0631619691848755, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0513, + "step": 5100 + }, + { + "epoch": 0.14610436025732665, + "grad_norm": 0.4327051639556885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0599, + "step": 5110 + }, + { + "epoch": 0.14639027877055039, + "grad_norm": 0.7790032029151917, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0617, + "step": 5120 + }, + { + "epoch": 0.14667619728377412, + "grad_norm": 0.42061591148376465, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.14696211579699786, + "grad_norm": 1.4090712070465088, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0564, + "step": 5140 + }, + { + "epoch": 0.1472480343102216, + "grad_norm": 0.540844738483429, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0505, + "step": 5150 + }, + { + "epoch": 0.1475339528234453, + "grad_norm": 0.5608566999435425, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0583, + "step": 5160 + }, + { + "epoch": 0.14781987133666905, + "grad_norm": 0.750708818435669, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0467, + "step": 5170 + }, + { + "epoch": 0.14810578984989278, + "grad_norm": 0.608989953994751, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0507, + "step": 5180 + }, + { + "epoch": 0.14839170836311652, + "grad_norm": 0.8176707029342651, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0455, + "step": 5190 + }, + { + "epoch": 0.14867762687634023, + "grad_norm": 0.5280511379241943, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0586, + "step": 5200 + }, + { + "epoch": 0.14896354538956397, + "grad_norm": 0.5914652347564697, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.042, + "step": 5210 + }, + { + "epoch": 0.1492494639027877, + "grad_norm": 0.4816238582134247, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0431, + "step": 5220 + }, + { + "epoch": 0.14953538241601144, + "grad_norm": 0.5413132309913635, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0453, + "step": 5230 + }, + { + "epoch": 0.14982130092923518, + "grad_norm": 0.749200701713562, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0505, + "step": 5240 + }, + { + "epoch": 0.1501072194424589, + "grad_norm": 0.8051598072052002, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.15039313795568263, + "grad_norm": 0.5365609526634216, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0383, + "step": 5260 + }, + { + "epoch": 0.15067905646890636, + "grad_norm": 0.5546812415122986, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0438, + "step": 5270 + }, + { + "epoch": 0.1509649749821301, + "grad_norm": 0.6248345375061035, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.045, + "step": 5280 + }, + { + "epoch": 0.1512508934953538, + "grad_norm": 0.42673179507255554, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0435, + "step": 5290 + }, + { + "epoch": 0.15153681200857755, + "grad_norm": 0.6677115559577942, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 0.15182273052180129, + "grad_norm": 0.4739227294921875, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0516, + "step": 5310 + }, + { + "epoch": 0.15210864903502502, + "grad_norm": 0.7931821346282959, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0566, + "step": 5320 + }, + { + "epoch": 0.15239456754824876, + "grad_norm": 0.6296460032463074, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0496, + "step": 5330 + }, + { + "epoch": 0.15268048606147247, + "grad_norm": 0.6713911890983582, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0462, + "step": 5340 + }, + { + "epoch": 0.1529664045746962, + "grad_norm": 1.088040828704834, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0663, + "step": 5350 + }, + { + "epoch": 0.15325232308791994, + "grad_norm": 1.4942265748977661, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0541, + "step": 5360 + }, + { + "epoch": 0.15353824160114368, + "grad_norm": 1.5721286535263062, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0546, + "step": 5370 + }, + { + "epoch": 0.1538241601143674, + "grad_norm": 0.9329798221588135, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0538, + "step": 5380 + }, + { + "epoch": 0.15411007862759113, + "grad_norm": 0.5658103823661804, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0561, + "step": 5390 + }, + { + "epoch": 0.15439599714081487, + "grad_norm": 0.6210218071937561, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.054, + "step": 5400 + }, + { + "epoch": 0.1546819156540386, + "grad_norm": 0.7934702634811401, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0506, + "step": 5410 + }, + { + "epoch": 0.15496783416726234, + "grad_norm": 1.0321810245513916, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0483, + "step": 5420 + }, + { + "epoch": 0.15525375268048605, + "grad_norm": 0.6226248145103455, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0464, + "step": 5430 + }, + { + "epoch": 0.1555396711937098, + "grad_norm": 0.6217877864837646, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0466, + "step": 5440 + }, + { + "epoch": 0.15582558970693353, + "grad_norm": 0.44068101048469543, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0517, + "step": 5450 + }, + { + "epoch": 0.15611150822015726, + "grad_norm": 0.4715922772884369, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0391, + "step": 5460 + }, + { + "epoch": 0.15639742673338097, + "grad_norm": 0.6649858951568604, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0524, + "step": 5470 + }, + { + "epoch": 0.1566833452466047, + "grad_norm": 0.5635918974876404, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.054, + "step": 5480 + }, + { + "epoch": 0.15696926375982845, + "grad_norm": 0.5584990978240967, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0559, + "step": 5490 + }, + { + "epoch": 0.15725518227305219, + "grad_norm": 0.7777124047279358, + "learning_rate": 1.895206504082557e-05, + "loss": 0.052, + "step": 5500 + }, + { + "epoch": 0.15754110078627592, + "grad_norm": 0.7057285308837891, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0507, + "step": 5510 + }, + { + "epoch": 0.15782701929949963, + "grad_norm": 0.4290146827697754, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0508, + "step": 5520 + }, + { + "epoch": 0.15811293781272337, + "grad_norm": 0.7333746552467346, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0609, + "step": 5530 + }, + { + "epoch": 0.1583988563259471, + "grad_norm": 0.6905514001846313, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0441, + "step": 5540 + }, + { + "epoch": 0.15868477483917084, + "grad_norm": 0.4859441816806793, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0586, + "step": 5550 + }, + { + "epoch": 0.15897069335239455, + "grad_norm": 0.4259501099586487, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0446, + "step": 5560 + }, + { + "epoch": 0.1592566118656183, + "grad_norm": 0.7659216523170471, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0486, + "step": 5570 + }, + { + "epoch": 0.15954253037884203, + "grad_norm": 0.6377918124198914, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0497, + "step": 5580 + }, + { + "epoch": 0.15982844889206577, + "grad_norm": 0.9122095704078674, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0497, + "step": 5590 + }, + { + "epoch": 0.1601143674052895, + "grad_norm": 0.5986319780349731, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0789, + "step": 5600 + }, + { + "epoch": 0.1604002859185132, + "grad_norm": 0.6486982107162476, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0481, + "step": 5610 + }, + { + "epoch": 0.16068620443173695, + "grad_norm": 0.9778286814689636, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0609, + "step": 5620 + }, + { + "epoch": 0.1609721229449607, + "grad_norm": 0.9133608341217041, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0499, + "step": 5630 + }, + { + "epoch": 0.16125804145818443, + "grad_norm": 0.8979085087776184, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0539, + "step": 5640 + }, + { + "epoch": 0.16154395997140814, + "grad_norm": 0.7787102460861206, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0593, + "step": 5650 + }, + { + "epoch": 0.16182987848463187, + "grad_norm": 0.8269296884536743, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0466, + "step": 5660 + }, + { + "epoch": 0.1621157969978556, + "grad_norm": 1.0018537044525146, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0542, + "step": 5670 + }, + { + "epoch": 0.16240171551107935, + "grad_norm": 0.6690066456794739, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0504, + "step": 5680 + }, + { + "epoch": 0.16268763402430308, + "grad_norm": 0.8186119198799133, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0523, + "step": 5690 + }, + { + "epoch": 0.1629735525375268, + "grad_norm": 0.6039218902587891, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.053, + "step": 5700 + }, + { + "epoch": 0.16325947105075053, + "grad_norm": 0.5570294857025146, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0627, + "step": 5710 + }, + { + "epoch": 0.16354538956397427, + "grad_norm": 0.6330029368400574, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.043, + "step": 5720 + }, + { + "epoch": 0.163831308077198, + "grad_norm": 0.42857953906059265, + "learning_rate": 1.884459101447439e-05, + "loss": 0.043, + "step": 5730 + }, + { + "epoch": 0.16411722659042172, + "grad_norm": 0.6611765027046204, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0478, + "step": 5740 + }, + { + "epoch": 0.16440314510364545, + "grad_norm": 0.5025321841239929, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0563, + "step": 5750 + }, + { + "epoch": 0.1646890636168692, + "grad_norm": 0.468772292137146, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0579, + "step": 5760 + }, + { + "epoch": 0.16497498213009293, + "grad_norm": 0.8914149403572083, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0451, + "step": 5770 + }, + { + "epoch": 0.16526090064331667, + "grad_norm": 0.7421362996101379, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0446, + "step": 5780 + }, + { + "epoch": 0.16554681915654038, + "grad_norm": 0.6159907579421997, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0486, + "step": 5790 + }, + { + "epoch": 0.1658327376697641, + "grad_norm": 0.7762402892112732, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0528, + "step": 5800 + }, + { + "epoch": 0.16611865618298785, + "grad_norm": 0.688562273979187, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0562, + "step": 5810 + }, + { + "epoch": 0.1664045746962116, + "grad_norm": 0.6233720183372498, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0454, + "step": 5820 + }, + { + "epoch": 0.1666904932094353, + "grad_norm": 1.0762931108474731, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0586, + "step": 5830 + }, + { + "epoch": 0.16697641172265903, + "grad_norm": 0.6782101988792419, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0486, + "step": 5840 + }, + { + "epoch": 0.16726233023588277, + "grad_norm": 0.8854986429214478, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0668, + "step": 5850 + }, + { + "epoch": 0.1675482487491065, + "grad_norm": 0.6537308096885681, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0456, + "step": 5860 + }, + { + "epoch": 0.16783416726233025, + "grad_norm": 1.4588080644607544, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0531, + "step": 5870 + }, + { + "epoch": 0.16812008577555396, + "grad_norm": 0.4888838529586792, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0608, + "step": 5880 + }, + { + "epoch": 0.1684060042887777, + "grad_norm": 0.6046859622001648, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0596, + "step": 5890 + }, + { + "epoch": 0.16869192280200143, + "grad_norm": 1.0373053550720215, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0672, + "step": 5900 + }, + { + "epoch": 0.16897784131522517, + "grad_norm": 0.7728743553161621, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0567, + "step": 5910 + }, + { + "epoch": 0.16926375982844888, + "grad_norm": 0.7804396152496338, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0443, + "step": 5920 + }, + { + "epoch": 0.16954967834167262, + "grad_norm": 0.5331568717956543, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0462, + "step": 5930 + }, + { + "epoch": 0.16983559685489635, + "grad_norm": 0.5623118877410889, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0549, + "step": 5940 + }, + { + "epoch": 0.1701215153681201, + "grad_norm": 0.5113009214401245, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.17040743388134383, + "grad_norm": 0.45996031165122986, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0474, + "step": 5960 + }, + { + "epoch": 0.17069335239456754, + "grad_norm": 0.9673702716827393, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0496, + "step": 5970 + }, + { + "epoch": 0.17097927090779128, + "grad_norm": 0.6134442687034607, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0472, + "step": 5980 + }, + { + "epoch": 0.171265189421015, + "grad_norm": 0.5929660797119141, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0448, + "step": 5990 + }, + { + "epoch": 0.17155110793423875, + "grad_norm": 0.6973591446876526, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0561, + "step": 6000 + }, + { + "epoch": 0.17183702644746246, + "grad_norm": 0.6361686587333679, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0528, + "step": 6010 + }, + { + "epoch": 0.1721229449606862, + "grad_norm": 0.8463344573974609, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0505, + "step": 6020 + }, + { + "epoch": 0.17240886347390993, + "grad_norm": 0.7931243777275085, + "learning_rate": 1.869709961183946e-05, + "loss": 0.047, + "step": 6030 + }, + { + "epoch": 0.17269478198713367, + "grad_norm": 0.8827673196792603, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0449, + "step": 6040 + }, + { + "epoch": 0.1729807005003574, + "grad_norm": 0.624167263507843, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0432, + "step": 6050 + }, + { + "epoch": 0.17326661901358112, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0546, + "step": 6060 + }, + { + "epoch": 0.17355253752680486, + "grad_norm": 0.6836652755737305, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0463, + "step": 6070 + }, + { + "epoch": 0.1738384560400286, + "grad_norm": 0.5454772114753723, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0554, + "step": 6080 + }, + { + "epoch": 0.17412437455325233, + "grad_norm": 0.3758164048194885, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0437, + "step": 6090 + }, + { + "epoch": 0.17441029306647604, + "grad_norm": 0.4269026517868042, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0436, + "step": 6100 + }, + { + "epoch": 0.17469621157969978, + "grad_norm": 1.3504232168197632, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0563, + "step": 6110 + }, + { + "epoch": 0.17498213009292352, + "grad_norm": 0.6270191669464111, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0552, + "step": 6120 + }, + { + "epoch": 0.17526804860614725, + "grad_norm": 0.7632624506950378, + "learning_rate": 1.864612143364565e-05, + "loss": 0.042, + "step": 6130 + }, + { + "epoch": 0.175553967119371, + "grad_norm": 0.7420883774757385, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0472, + "step": 6140 + }, + { + "epoch": 0.1758398856325947, + "grad_norm": 0.38518550992012024, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0494, + "step": 6150 + }, + { + "epoch": 0.17612580414581844, + "grad_norm": 0.4203122556209564, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.049, + "step": 6160 + }, + { + "epoch": 0.17641172265904217, + "grad_norm": 0.843169093132019, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0528, + "step": 6170 + }, + { + "epoch": 0.1766976411722659, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0441, + "step": 6180 + }, + { + "epoch": 0.17698355968548962, + "grad_norm": 0.9894040822982788, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0494, + "step": 6190 + }, + { + "epoch": 0.17726947819871336, + "grad_norm": 0.8269744515419006, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0533, + "step": 6200 + }, + { + "epoch": 0.1775553967119371, + "grad_norm": 0.7923200726509094, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0518, + "step": 6210 + }, + { + "epoch": 0.17784131522516083, + "grad_norm": 0.580436646938324, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 0.17812723373838457, + "grad_norm": 1.0633399486541748, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0528, + "step": 6230 + }, + { + "epoch": 0.17841315225160828, + "grad_norm": 0.925599217414856, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0552, + "step": 6240 + }, + { + "epoch": 0.17869907076483202, + "grad_norm": 0.5874597430229187, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0473, + "step": 6250 + }, + { + "epoch": 0.17898498927805576, + "grad_norm": 0.9065818190574646, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0504, + "step": 6260 + }, + { + "epoch": 0.1792709077912795, + "grad_norm": 0.9060930609703064, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0578, + "step": 6270 + }, + { + "epoch": 0.1795568263045032, + "grad_norm": 0.6221855878829956, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.17984274481772694, + "grad_norm": 0.589621901512146, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0452, + "step": 6290 + }, + { + "epoch": 0.18012866333095068, + "grad_norm": 0.4308580756187439, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0528, + "step": 6300 + }, + { + "epoch": 0.18041458184417442, + "grad_norm": 0.34031248092651367, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0544, + "step": 6310 + }, + { + "epoch": 0.18070050035739815, + "grad_norm": 0.6438931226730347, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0411, + "step": 6320 + }, + { + "epoch": 0.18098641887062186, + "grad_norm": 0.5436957478523254, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0381, + "step": 6330 + }, + { + "epoch": 0.1812723373838456, + "grad_norm": 0.7326043248176575, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0486, + "step": 6340 + }, + { + "epoch": 0.18155825589706934, + "grad_norm": 0.9194608330726624, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0455, + "step": 6350 + }, + { + "epoch": 0.18184417441029307, + "grad_norm": 0.9366886019706726, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0529, + "step": 6360 + }, + { + "epoch": 0.18213009292351678, + "grad_norm": 0.3178311586380005, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0455, + "step": 6370 + }, + { + "epoch": 0.18241601143674052, + "grad_norm": 0.9811000823974609, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.054, + "step": 6380 + }, + { + "epoch": 0.18270192994996426, + "grad_norm": 0.4635869562625885, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0466, + "step": 6390 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.6958444118499756, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0448, + "step": 6400 + }, + { + "epoch": 0.18327376697641173, + "grad_norm": 0.765814483165741, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0558, + "step": 6410 + }, + { + "epoch": 0.18355968548963544, + "grad_norm": 0.4117525815963745, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0484, + "step": 6420 + }, + { + "epoch": 0.18384560400285918, + "grad_norm": 0.6114997267723083, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0449, + "step": 6430 + }, + { + "epoch": 0.18413152251608292, + "grad_norm": 0.6006572842597961, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0442, + "step": 6440 + }, + { + "epoch": 0.18441744102930666, + "grad_norm": 0.5918669104576111, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0472, + "step": 6450 + }, + { + "epoch": 0.18470335954253037, + "grad_norm": 0.42107391357421875, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0471, + "step": 6460 + }, + { + "epoch": 0.1849892780557541, + "grad_norm": 0.5666350722312927, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0451, + "step": 6470 + }, + { + "epoch": 0.18527519656897784, + "grad_norm": 0.6074198484420776, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.051, + "step": 6480 + }, + { + "epoch": 0.18556111508220158, + "grad_norm": 0.771105945110321, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0402, + "step": 6490 + }, + { + "epoch": 0.18584703359542531, + "grad_norm": 0.6381934881210327, + "learning_rate": 1.844974808419918e-05, + "loss": 0.049, + "step": 6500 + }, + { + "epoch": 0.18613295210864902, + "grad_norm": 0.4039069712162018, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0477, + "step": 6510 + }, + { + "epoch": 0.18641887062187276, + "grad_norm": 0.8936404585838318, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0515, + "step": 6520 + }, + { + "epoch": 0.1867047891350965, + "grad_norm": 0.5358276963233948, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0397, + "step": 6530 + }, + { + "epoch": 0.18699070764832024, + "grad_norm": 0.7260947823524475, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0501, + "step": 6540 + }, + { + "epoch": 0.18727662616154395, + "grad_norm": 0.6378960609436035, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0575, + "step": 6550 + }, + { + "epoch": 0.18756254467476768, + "grad_norm": 0.5879429578781128, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.041, + "step": 6560 + }, + { + "epoch": 0.18784846318799142, + "grad_norm": 0.846297025680542, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0494, + "step": 6570 + }, + { + "epoch": 0.18813438170121516, + "grad_norm": 0.5211764574050903, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0463, + "step": 6580 + }, + { + "epoch": 0.1884203002144389, + "grad_norm": 0.8060504794120789, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0486, + "step": 6590 + }, + { + "epoch": 0.1887062187276626, + "grad_norm": 0.5741685628890991, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0435, + "step": 6600 + }, + { + "epoch": 0.18899213724088634, + "grad_norm": 0.6195408701896667, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0609, + "step": 6610 + }, + { + "epoch": 0.18927805575411008, + "grad_norm": 0.46843090653419495, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0477, + "step": 6620 + }, + { + "epoch": 0.18956397426733382, + "grad_norm": 0.5169982314109802, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0515, + "step": 6630 + }, + { + "epoch": 0.18984989278055753, + "grad_norm": 0.5571608543395996, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0492, + "step": 6640 + }, + { + "epoch": 0.19013581129378126, + "grad_norm": 0.7798209190368652, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0682, + "step": 6650 + }, + { + "epoch": 0.190421729807005, + "grad_norm": 0.6120383143424988, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0516, + "step": 6660 + }, + { + "epoch": 0.19070764832022874, + "grad_norm": 1.0191924571990967, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.049, + "step": 6670 + }, + { + "epoch": 0.19099356683345248, + "grad_norm": 0.5271646976470947, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0461, + "step": 6680 + }, + { + "epoch": 0.1912794853466762, + "grad_norm": 0.3315111994743347, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 0.19156540385989992, + "grad_norm": 0.7598944306373596, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0576, + "step": 6700 + }, + { + "epoch": 0.19185132237312366, + "grad_norm": 0.8039186596870422, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0489, + "step": 6710 + }, + { + "epoch": 0.1921372408863474, + "grad_norm": 0.911704957485199, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0508, + "step": 6720 + }, + { + "epoch": 0.1924231593995711, + "grad_norm": 0.6092261672019958, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0494, + "step": 6730 + }, + { + "epoch": 0.19270907791279485, + "grad_norm": 0.7890674471855164, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.049, + "step": 6740 + }, + { + "epoch": 0.19299499642601858, + "grad_norm": 0.8601320385932922, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0402, + "step": 6750 + }, + { + "epoch": 0.19328091493924232, + "grad_norm": 0.8750951290130615, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0517, + "step": 6760 + }, + { + "epoch": 0.19356683345246606, + "grad_norm": 0.7143217921257019, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0414, + "step": 6770 + }, + { + "epoch": 0.19385275196568977, + "grad_norm": 0.8340809345245361, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.1941386704789135, + "grad_norm": 0.4074079692363739, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0455, + "step": 6790 + }, + { + "epoch": 0.19442458899213724, + "grad_norm": 0.5369135737419128, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0472, + "step": 6800 + }, + { + "epoch": 0.19471050750536098, + "grad_norm": 0.44467195868492126, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 0.1949964260185847, + "grad_norm": 0.6032440662384033, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0479, + "step": 6820 + }, + { + "epoch": 0.19528234453180843, + "grad_norm": 0.4078349173069, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 0.19556826304503216, + "grad_norm": 0.49480268359184265, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0432, + "step": 6840 + }, + { + "epoch": 0.1958541815582559, + "grad_norm": 0.9844514727592468, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0479, + "step": 6850 + }, + { + "epoch": 0.19614010007147964, + "grad_norm": 1.1353951692581177, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0539, + "step": 6860 + }, + { + "epoch": 0.19642601858470335, + "grad_norm": 0.7535272836685181, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0572, + "step": 6870 + }, + { + "epoch": 0.1967119370979271, + "grad_norm": 0.4950162470340729, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0466, + "step": 6880 + }, + { + "epoch": 0.19699785561115082, + "grad_norm": 0.5310598015785217, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0487, + "step": 6890 + }, + { + "epoch": 0.19728377412437456, + "grad_norm": 0.9481188654899597, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0448, + "step": 6900 + }, + { + "epoch": 0.19756969263759827, + "grad_norm": 0.5303207039833069, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0419, + "step": 6910 + }, + { + "epoch": 0.197855611150822, + "grad_norm": 0.6180852055549622, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 0.19814152966404575, + "grad_norm": 0.5310384631156921, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0471, + "step": 6930 + }, + { + "epoch": 0.19842744817726948, + "grad_norm": 0.546660304069519, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0481, + "step": 6940 + }, + { + "epoch": 0.19871336669049322, + "grad_norm": 0.7824214696884155, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0423, + "step": 6950 + }, + { + "epoch": 0.19899928520371693, + "grad_norm": 0.9130761623382568, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0436, + "step": 6960 + }, + { + "epoch": 0.19928520371694067, + "grad_norm": 1.0512481927871704, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 0.1995711222301644, + "grad_norm": 0.8660218715667725, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0533, + "step": 6980 + }, + { + "epoch": 0.19985704074338814, + "grad_norm": 0.5280078649520874, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0455, + "step": 6990 + }, + { + "epoch": 0.20014295925661185, + "grad_norm": 0.6151753067970276, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0476, + "step": 7000 + }, + { + "epoch": 0.2004288777698356, + "grad_norm": 0.7165628671646118, + "learning_rate": 1.815952390818299e-05, + "loss": 0.051, + "step": 7010 + }, + { + "epoch": 0.20071479628305933, + "grad_norm": 0.6857513189315796, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0566, + "step": 7020 + }, + { + "epoch": 0.20100071479628306, + "grad_norm": 0.5589154958724976, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0741, + "step": 7030 + }, + { + "epoch": 0.2012866333095068, + "grad_norm": 0.6684713959693909, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0461, + "step": 7040 + }, + { + "epoch": 0.2015725518227305, + "grad_norm": 0.41142046451568604, + "learning_rate": 1.813582526827608e-05, + "loss": 0.043, + "step": 7050 + }, + { + "epoch": 0.20185847033595425, + "grad_norm": 0.29734253883361816, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0464, + "step": 7060 + }, + { + "epoch": 0.20214438884917799, + "grad_norm": 0.3914707899093628, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.052, + "step": 7070 + }, + { + "epoch": 0.20243030736240172, + "grad_norm": 0.5075880885124207, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0401, + "step": 7080 + }, + { + "epoch": 0.20271622587562543, + "grad_norm": 0.6182138919830322, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0428, + "step": 7090 + }, + { + "epoch": 0.20300214438884917, + "grad_norm": 1.0438663959503174, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0476, + "step": 7100 + }, + { + "epoch": 0.2032880629020729, + "grad_norm": 0.4646940529346466, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0419, + "step": 7110 + }, + { + "epoch": 0.20357398141529665, + "grad_norm": 0.4236893951892853, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0539, + "step": 7120 + }, + { + "epoch": 0.20385989992852038, + "grad_norm": 0.7975651025772095, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0459, + "step": 7130 + }, + { + "epoch": 0.2041458184417441, + "grad_norm": 0.9628227949142456, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0568, + "step": 7140 + }, + { + "epoch": 0.20443173695496783, + "grad_norm": 0.8878718614578247, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0528, + "step": 7150 + }, + { + "epoch": 0.20471765546819157, + "grad_norm": 0.5407359004020691, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0417, + "step": 7160 + }, + { + "epoch": 0.2050035739814153, + "grad_norm": 0.4407803416252136, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0435, + "step": 7170 + }, + { + "epoch": 0.20528949249463901, + "grad_norm": 0.4055456221103668, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0652, + "step": 7180 + }, + { + "epoch": 0.20557541100786275, + "grad_norm": 0.44706887006759644, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0476, + "step": 7190 + }, + { + "epoch": 0.2058613295210865, + "grad_norm": 1.2640881538391113, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0496, + "step": 7200 + }, + { + "epoch": 0.20614724803431023, + "grad_norm": 0.3773214817047119, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0455, + "step": 7210 + }, + { + "epoch": 0.20643316654753396, + "grad_norm": 0.6460191011428833, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0765, + "step": 7220 + }, + { + "epoch": 0.20671908506075767, + "grad_norm": 0.6048172116279602, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0506, + "step": 7230 + }, + { + "epoch": 0.2070050035739814, + "grad_norm": 0.38502392172813416, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0487, + "step": 7240 + }, + { + "epoch": 0.20729092208720515, + "grad_norm": 1.5727262496948242, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.20757684060042889, + "grad_norm": 0.3985368609428406, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0501, + "step": 7260 + }, + { + "epoch": 0.2078627591136526, + "grad_norm": 0.4519219994544983, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0542, + "step": 7270 + }, + { + "epoch": 0.20814867762687633, + "grad_norm": 0.6547327637672424, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0628, + "step": 7280 + }, + { + "epoch": 0.20843459614010007, + "grad_norm": 0.7864896655082703, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0521, + "step": 7290 + }, + { + "epoch": 0.2087205146533238, + "grad_norm": 0.6605416536331177, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0501, + "step": 7300 + }, + { + "epoch": 0.20900643316654754, + "grad_norm": 0.8260928988456726, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 0.20929235167977125, + "grad_norm": 0.7167025804519653, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0465, + "step": 7320 + }, + { + "epoch": 0.209578270192995, + "grad_norm": 0.6838316917419434, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0449, + "step": 7330 + }, + { + "epoch": 0.20986418870621873, + "grad_norm": 0.46520882844924927, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0441, + "step": 7340 + }, + { + "epoch": 0.21015010721944247, + "grad_norm": 0.680860698223114, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0498, + "step": 7350 + }, + { + "epoch": 0.21043602573266618, + "grad_norm": 0.6697542071342468, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0361, + "step": 7360 + }, + { + "epoch": 0.21072194424588991, + "grad_norm": 0.9322425127029419, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0561, + "step": 7370 + }, + { + "epoch": 0.21100786275911365, + "grad_norm": 0.7454982399940491, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0464, + "step": 7380 + }, + { + "epoch": 0.2112937812723374, + "grad_norm": 0.5052962899208069, + "learning_rate": 1.792902262617481e-05, + "loss": 0.042, + "step": 7390 + }, + { + "epoch": 0.21157969978556113, + "grad_norm": 0.622719407081604, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0392, + "step": 7400 + }, + { + "epoch": 0.21186561829878484, + "grad_norm": 0.8296751976013184, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0512, + "step": 7410 + }, + { + "epoch": 0.21215153681200857, + "grad_norm": 0.7341750860214233, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0407, + "step": 7420 + }, + { + "epoch": 0.2124374553252323, + "grad_norm": 0.8206498026847839, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0491, + "step": 7430 + }, + { + "epoch": 0.21272337383845605, + "grad_norm": 0.5625871419906616, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0405, + "step": 7440 + }, + { + "epoch": 0.21300929235167976, + "grad_norm": 0.600284218788147, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0464, + "step": 7450 + }, + { + "epoch": 0.2132952108649035, + "grad_norm": 1.0839911699295044, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0481, + "step": 7460 + }, + { + "epoch": 0.21358112937812723, + "grad_norm": 0.45663371682167053, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0478, + "step": 7470 + }, + { + "epoch": 0.21386704789135097, + "grad_norm": 0.9196961522102356, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0566, + "step": 7480 + }, + { + "epoch": 0.2141529664045747, + "grad_norm": 0.5013288855552673, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0442, + "step": 7490 + }, + { + "epoch": 0.21443888491779842, + "grad_norm": 0.6444706916809082, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0484, + "step": 7500 + }, + { + "epoch": 0.21472480343102215, + "grad_norm": 0.5789361000061035, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0536, + "step": 7510 + }, + { + "epoch": 0.2150107219442459, + "grad_norm": 0.7474827170372009, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0526, + "step": 7520 + }, + { + "epoch": 0.21529664045746963, + "grad_norm": 0.7054215669631958, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0538, + "step": 7530 + }, + { + "epoch": 0.21558255897069334, + "grad_norm": 0.9778858423233032, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0533, + "step": 7540 + }, + { + "epoch": 0.21586847748391708, + "grad_norm": 0.7189548015594482, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0479, + "step": 7550 + }, + { + "epoch": 0.2161543959971408, + "grad_norm": 0.8761522769927979, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0512, + "step": 7560 + }, + { + "epoch": 0.21644031451036455, + "grad_norm": 0.6686418652534485, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.06, + "step": 7570 + }, + { + "epoch": 0.2167262330235883, + "grad_norm": 0.6385156512260437, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0495, + "step": 7580 + }, + { + "epoch": 0.217012151536812, + "grad_norm": 0.4785522520542145, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0477, + "step": 7590 + }, + { + "epoch": 0.21729807005003574, + "grad_norm": 0.883179783821106, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0472, + "step": 7600 + }, + { + "epoch": 0.21758398856325947, + "grad_norm": 0.5431568026542664, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0383, + "step": 7610 + }, + { + "epoch": 0.2178699070764832, + "grad_norm": 0.7085764408111572, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0465, + "step": 7620 + }, + { + "epoch": 0.21815582558970692, + "grad_norm": 0.4877212643623352, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0517, + "step": 7630 + }, + { + "epoch": 0.21844174410293066, + "grad_norm": 0.6874392032623291, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0555, + "step": 7640 + }, + { + "epoch": 0.2187276626161544, + "grad_norm": 0.9611791372299194, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0415, + "step": 7650 + }, + { + "epoch": 0.21901358112937813, + "grad_norm": 0.3618314862251282, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0601, + "step": 7660 + }, + { + "epoch": 0.21929949964260187, + "grad_norm": 0.5366251468658447, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0383, + "step": 7670 + }, + { + "epoch": 0.21958541815582558, + "grad_norm": 0.6323129534721375, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0536, + "step": 7680 + }, + { + "epoch": 0.21987133666904932, + "grad_norm": 0.4621681571006775, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0442, + "step": 7690 + }, + { + "epoch": 0.22015725518227305, + "grad_norm": 0.9297679662704468, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0517, + "step": 7700 + }, + { + "epoch": 0.2204431736954968, + "grad_norm": 0.5950489640235901, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0468, + "step": 7710 + }, + { + "epoch": 0.2207290922087205, + "grad_norm": 0.30251142382621765, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0471, + "step": 7720 + }, + { + "epoch": 0.22101501072194424, + "grad_norm": 0.6247804760932922, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0488, + "step": 7730 + }, + { + "epoch": 0.22130092923516798, + "grad_norm": 0.7118366360664368, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0567, + "step": 7740 + }, + { + "epoch": 0.2215868477483917, + "grad_norm": 0.6265056133270264, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.06, + "step": 7750 + }, + { + "epoch": 0.22187276626161545, + "grad_norm": 0.7232056260108948, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0393, + "step": 7760 + }, + { + "epoch": 0.22215868477483916, + "grad_norm": 0.7981307506561279, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0518, + "step": 7770 + }, + { + "epoch": 0.2224446032880629, + "grad_norm": 0.4492819011211395, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0425, + "step": 7780 + }, + { + "epoch": 0.22273052180128664, + "grad_norm": 0.578440248966217, + "learning_rate": 1.767371389304538e-05, + "loss": 0.043, + "step": 7790 + }, + { + "epoch": 0.22301644031451037, + "grad_norm": 0.8093826174736023, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0571, + "step": 7800 + }, + { + "epoch": 0.22330235882773408, + "grad_norm": 0.864661455154419, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0429, + "step": 7810 + }, + { + "epoch": 0.22358827734095782, + "grad_norm": 0.50054532289505, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0404, + "step": 7820 + }, + { + "epoch": 0.22387419585418156, + "grad_norm": 0.5690511465072632, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0406, + "step": 7830 + }, + { + "epoch": 0.2241601143674053, + "grad_norm": 0.7075231671333313, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0478, + "step": 7840 + }, + { + "epoch": 0.22444603288062903, + "grad_norm": 0.6326742768287659, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.065, + "step": 7850 + }, + { + "epoch": 0.22473195139385274, + "grad_norm": 0.48305049538612366, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0456, + "step": 7860 + }, + { + "epoch": 0.22501786990707648, + "grad_norm": 0.6333707571029663, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.048, + "step": 7870 + }, + { + "epoch": 0.22530378842030022, + "grad_norm": 0.6568662524223328, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0422, + "step": 7880 + }, + { + "epoch": 0.22558970693352395, + "grad_norm": 0.6302695870399475, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0589, + "step": 7890 + }, + { + "epoch": 0.22587562544674766, + "grad_norm": 0.6373940110206604, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0504, + "step": 7900 + }, + { + "epoch": 0.2261615439599714, + "grad_norm": 0.7108445167541504, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0486, + "step": 7910 + }, + { + "epoch": 0.22644746247319514, + "grad_norm": 0.5274208784103394, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0693, + "step": 7920 + }, + { + "epoch": 0.22673338098641888, + "grad_norm": 0.4020678997039795, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0461, + "step": 7930 + }, + { + "epoch": 0.2270192994996426, + "grad_norm": 0.5584745407104492, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0376, + "step": 7940 + }, + { + "epoch": 0.22730521801286632, + "grad_norm": 0.6614044904708862, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0461, + "step": 7950 + }, + { + "epoch": 0.22759113652609006, + "grad_norm": 0.506636917591095, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0431, + "step": 7960 + }, + { + "epoch": 0.2278770550393138, + "grad_norm": 0.5168156027793884, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0404, + "step": 7970 + }, + { + "epoch": 0.22816297355253753, + "grad_norm": 0.552480161190033, + "learning_rate": 1.754802282200567e-05, + "loss": 0.0565, + "step": 7980 + }, + { + "epoch": 0.22844889206576124, + "grad_norm": 0.8191191554069519, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0556, + "step": 7990 + }, + { + "epoch": 0.22873481057898498, + "grad_norm": 0.7767695188522339, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0447, + "step": 8000 + }, + { + "epoch": 0.22902072909220872, + "grad_norm": 0.9050281047821045, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0611, + "step": 8010 + }, + { + "epoch": 0.22930664760543246, + "grad_norm": 0.7805314660072327, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0532, + "step": 8020 + }, + { + "epoch": 0.2295925661186562, + "grad_norm": 0.6055987477302551, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0436, + "step": 8030 + }, + { + "epoch": 0.2298784846318799, + "grad_norm": 1.1075741052627563, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.053, + "step": 8040 + }, + { + "epoch": 0.23016440314510364, + "grad_norm": 0.6283855438232422, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0494, + "step": 8050 + }, + { + "epoch": 0.23045032165832738, + "grad_norm": 0.44009697437286377, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.047, + "step": 8060 + }, + { + "epoch": 0.23073624017155112, + "grad_norm": 0.4920162856578827, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0437, + "step": 8070 + }, + { + "epoch": 0.23102215868477483, + "grad_norm": 0.9286724328994751, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0513, + "step": 8080 + }, + { + "epoch": 0.23130807719799856, + "grad_norm": 0.6595107913017273, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0465, + "step": 8090 + }, + { + "epoch": 0.2315939957112223, + "grad_norm": 0.4930933713912964, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0422, + "step": 8100 + }, + { + "epoch": 0.23187991422444604, + "grad_norm": 0.6741859316825867, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0419, + "step": 8110 + }, + { + "epoch": 0.23216583273766978, + "grad_norm": 0.8081800937652588, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0449, + "step": 8120 + }, + { + "epoch": 0.23245175125089348, + "grad_norm": 1.0258036851882935, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0613, + "step": 8130 + }, + { + "epoch": 0.23273766976411722, + "grad_norm": 0.5007345080375671, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0473, + "step": 8140 + }, + { + "epoch": 0.23302358827734096, + "grad_norm": 0.3931804895401001, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0495, + "step": 8150 + }, + { + "epoch": 0.2333095067905647, + "grad_norm": 0.5907166600227356, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0449, + "step": 8160 + }, + { + "epoch": 0.2335954253037884, + "grad_norm": 0.49229851365089417, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0524, + "step": 8170 + }, + { + "epoch": 0.23388134381701214, + "grad_norm": 0.8386240601539612, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0527, + "step": 8180 + }, + { + "epoch": 0.23416726233023588, + "grad_norm": 0.7806615829467773, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0529, + "step": 8190 + }, + { + "epoch": 0.23445318084345962, + "grad_norm": 0.5716270804405212, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0534, + "step": 8200 + }, + { + "epoch": 0.23473909935668336, + "grad_norm": 1.165761947631836, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0591, + "step": 8210 + }, + { + "epoch": 0.23502501786990707, + "grad_norm": 0.867967426776886, + "learning_rate": 1.738529690353544e-05, + "loss": 0.049, + "step": 8220 + }, + { + "epoch": 0.2353109363831308, + "grad_norm": 0.5809492468833923, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0434, + "step": 8230 + }, + { + "epoch": 0.23559685489635454, + "grad_norm": 0.8418740034103394, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0461, + "step": 8240 + }, + { + "epoch": 0.23588277340957828, + "grad_norm": 0.5811617374420166, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0443, + "step": 8250 + }, + { + "epoch": 0.236168691922802, + "grad_norm": 0.7699318528175354, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0549, + "step": 8260 + }, + { + "epoch": 0.23645461043602573, + "grad_norm": 0.6066992878913879, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0415, + "step": 8270 + }, + { + "epoch": 0.23674052894924946, + "grad_norm": 0.7775973677635193, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0619, + "step": 8280 + }, + { + "epoch": 0.2370264474624732, + "grad_norm": 0.8320962190628052, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.048, + "step": 8290 + }, + { + "epoch": 0.23731236597569694, + "grad_norm": 0.7203818559646606, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0594, + "step": 8300 + }, + { + "epoch": 0.23759828448892065, + "grad_norm": 0.7634598612785339, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0614, + "step": 8310 + }, + { + "epoch": 0.23788420300214438, + "grad_norm": 0.557575523853302, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 0.23817012151536812, + "grad_norm": 1.0139968395233154, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0457, + "step": 8330 + }, + { + "epoch": 0.23845604002859186, + "grad_norm": 0.5543113946914673, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.048, + "step": 8340 + }, + { + "epoch": 0.23874195854181557, + "grad_norm": 1.0122590065002441, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0509, + "step": 8350 + }, + { + "epoch": 0.2390278770550393, + "grad_norm": 0.8776134252548218, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0475, + "step": 8360 + }, + { + "epoch": 0.23931379556826304, + "grad_norm": 0.41230106353759766, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0467, + "step": 8370 + }, + { + "epoch": 0.23959971408148678, + "grad_norm": 0.5460986495018005, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0455, + "step": 8380 + }, + { + "epoch": 0.23988563259471052, + "grad_norm": 0.5896333456039429, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.051, + "step": 8390 + }, + { + "epoch": 0.24017155110793423, + "grad_norm": 0.536375105381012, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0432, + "step": 8400 + }, + { + "epoch": 0.24045746962115797, + "grad_norm": 0.7597050666809082, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0459, + "step": 8410 + }, + { + "epoch": 0.2407433881343817, + "grad_norm": 0.6669795513153076, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0584, + "step": 8420 + }, + { + "epoch": 0.24102930664760544, + "grad_norm": 0.3614502251148224, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.039, + "step": 8430 + }, + { + "epoch": 0.24131522516082915, + "grad_norm": 0.5618023872375488, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0394, + "step": 8440 + }, + { + "epoch": 0.2416011436740529, + "grad_norm": 0.5897185802459717, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0502, + "step": 8450 + }, + { + "epoch": 0.24188706218727662, + "grad_norm": 0.5622876882553101, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0382, + "step": 8460 + }, + { + "epoch": 0.24217298070050036, + "grad_norm": 0.5639696717262268, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0652, + "step": 8470 + }, + { + "epoch": 0.2424588992137241, + "grad_norm": 0.5686836242675781, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0609, + "step": 8480 + }, + { + "epoch": 0.2427448177269478, + "grad_norm": 0.7248222827911377, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0512, + "step": 8490 + }, + { + "epoch": 0.24303073624017155, + "grad_norm": 0.6157225370407104, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0449, + "step": 8500 + }, + { + "epoch": 0.24331665475339528, + "grad_norm": 1.1660966873168945, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0427, + "step": 8510 + }, + { + "epoch": 0.24360257326661902, + "grad_norm": 1.1242589950561523, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0514, + "step": 8520 + }, + { + "epoch": 0.24388849177984273, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0491, + "step": 8530 + }, + { + "epoch": 0.24417441029306647, + "grad_norm": 0.41474589705467224, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0427, + "step": 8540 + }, + { + "epoch": 0.2444603288062902, + "grad_norm": 0.42195969820022583, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0486, + "step": 8550 + }, + { + "epoch": 0.24474624731951394, + "grad_norm": 0.3914433717727661, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0411, + "step": 8560 + }, + { + "epoch": 0.24503216583273768, + "grad_norm": 0.7590876817703247, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0439, + "step": 8570 + }, + { + "epoch": 0.2453180843459614, + "grad_norm": 0.4362296164035797, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0466, + "step": 8580 + }, + { + "epoch": 0.24560400285918513, + "grad_norm": 0.467949241399765, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0502, + "step": 8590 + }, + { + "epoch": 0.24588992137240887, + "grad_norm": 0.4731729328632355, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0599, + "step": 8600 + }, + { + "epoch": 0.2461758398856326, + "grad_norm": 0.491644948720932, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0524, + "step": 8610 + }, + { + "epoch": 0.2464617583988563, + "grad_norm": 0.5254928469657898, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0469, + "step": 8620 + }, + { + "epoch": 0.24674767691208005, + "grad_norm": 0.5721238255500793, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0493, + "step": 8630 + }, + { + "epoch": 0.2470335954253038, + "grad_norm": 0.5806096792221069, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0391, + "step": 8640 + }, + { + "epoch": 0.24731951393852752, + "grad_norm": 0.6683222055435181, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.24760543245175126, + "grad_norm": 0.41728726029396057, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0411, + "step": 8660 + }, + { + "epoch": 0.24789135096497497, + "grad_norm": 0.6001113653182983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0413, + "step": 8670 + }, + { + "epoch": 0.2481772694781987, + "grad_norm": 0.43813610076904297, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0389, + "step": 8680 + }, + { + "epoch": 0.24846318799142245, + "grad_norm": 1.5533791780471802, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0597, + "step": 8690 + }, + { + "epoch": 0.24874910650464618, + "grad_norm": 1.175837755203247, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 0.2490350250178699, + "grad_norm": 0.4798300862312317, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0459, + "step": 8710 + }, + { + "epoch": 0.24932094353109363, + "grad_norm": 0.7334772944450378, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0432, + "step": 8720 + }, + { + "epoch": 0.24960686204431737, + "grad_norm": 0.9633310437202454, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.05, + "step": 8730 + }, + { + "epoch": 0.2498927805575411, + "grad_norm": 0.7353480458259583, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.25017869907076484, + "grad_norm": 0.5958748459815979, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0428, + "step": 8750 + }, + { + "epoch": 0.2504646175839886, + "grad_norm": 0.8538689613342285, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0498, + "step": 8760 + }, + { + "epoch": 0.2507505360972123, + "grad_norm": 0.606607973575592, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.251036454610436, + "grad_norm": 0.3999035060405731, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0714, + "step": 8780 + }, + { + "epoch": 0.25132237312365974, + "grad_norm": 0.807314932346344, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.057, + "step": 8790 + }, + { + "epoch": 0.2516082916368835, + "grad_norm": 0.5238217115402222, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0485, + "step": 8800 + }, + { + "epoch": 0.2518942101501072, + "grad_norm": 1.6465950012207031, + "learning_rate": 1.696714953556411e-05, + "loss": 0.056, + "step": 8810 + }, + { + "epoch": 0.25218012866333095, + "grad_norm": 0.6568214297294617, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0424, + "step": 8820 + }, + { + "epoch": 0.2524660471765547, + "grad_norm": 0.4695168137550354, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0456, + "step": 8830 + }, + { + "epoch": 0.2527519656897784, + "grad_norm": 0.5652263164520264, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0527, + "step": 8840 + }, + { + "epoch": 0.25303788420300216, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0441, + "step": 8850 + }, + { + "epoch": 0.2533238027162259, + "grad_norm": 0.8288971781730652, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0513, + "step": 8860 + }, + { + "epoch": 0.2536097212294496, + "grad_norm": 0.8606051802635193, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0416, + "step": 8870 + }, + { + "epoch": 0.2538956397426733, + "grad_norm": 0.7235842347145081, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0481, + "step": 8880 + }, + { + "epoch": 0.25418155825589706, + "grad_norm": 0.9602673053741455, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0465, + "step": 8890 + }, + { + "epoch": 0.2544674767691208, + "grad_norm": 0.6431217789649963, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0489, + "step": 8900 + }, + { + "epoch": 0.25475339528234453, + "grad_norm": 0.42215701937675476, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0376, + "step": 8910 + }, + { + "epoch": 0.25503931379556827, + "grad_norm": 0.5899976491928101, + "learning_rate": 1.688644181174108e-05, + "loss": 0.048, + "step": 8920 + }, + { + "epoch": 0.255325232308792, + "grad_norm": 0.9504411816596985, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.047, + "step": 8930 + }, + { + "epoch": 0.25561115082201574, + "grad_norm": 0.5808438062667847, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0535, + "step": 8940 + }, + { + "epoch": 0.2558970693352395, + "grad_norm": 0.3811270594596863, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0418, + "step": 8950 + }, + { + "epoch": 0.25618298784846316, + "grad_norm": 1.0257363319396973, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0548, + "step": 8960 + }, + { + "epoch": 0.2564689063616869, + "grad_norm": 0.7294469475746155, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0569, + "step": 8970 + }, + { + "epoch": 0.25675482487491064, + "grad_norm": 0.4967000484466553, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0488, + "step": 8980 + }, + { + "epoch": 0.2570407433881344, + "grad_norm": 0.9160422086715698, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0471, + "step": 8990 + }, + { + "epoch": 0.2573266619013581, + "grad_norm": 0.5125435590744019, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.25761258041458185, + "grad_norm": 0.5617201328277588, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0597, + "step": 9010 + }, + { + "epoch": 0.2578984989278056, + "grad_norm": 0.7771851420402527, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0485, + "step": 9020 + }, + { + "epoch": 0.2581844174410293, + "grad_norm": 0.8434289693832397, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0429, + "step": 9030 + }, + { + "epoch": 0.25847033595425306, + "grad_norm": 0.513541042804718, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0488, + "step": 9040 + }, + { + "epoch": 0.25875625446747674, + "grad_norm": 1.0142096281051636, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0517, + "step": 9050 + }, + { + "epoch": 0.2590421729807005, + "grad_norm": 0.6343669295310974, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.049, + "step": 9060 + }, + { + "epoch": 0.2593280914939242, + "grad_norm": 0.33996936678886414, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.037, + "step": 9070 + }, + { + "epoch": 0.25961401000714796, + "grad_norm": 0.5964446663856506, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 0.2598999285203717, + "grad_norm": 0.4989728629589081, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0463, + "step": 9090 + }, + { + "epoch": 0.26018584703359543, + "grad_norm": 0.7735986113548279, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0576, + "step": 9100 + }, + { + "epoch": 0.26047176554681917, + "grad_norm": 1.2520418167114258, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0577, + "step": 9110 + }, + { + "epoch": 0.2607576840600429, + "grad_norm": 0.45247936248779297, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0458, + "step": 9120 + }, + { + "epoch": 0.26104360257326664, + "grad_norm": 0.8944823145866394, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0498, + "step": 9130 + }, + { + "epoch": 0.2613295210864903, + "grad_norm": 0.8308315277099609, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0545, + "step": 9140 + }, + { + "epoch": 0.26161543959971406, + "grad_norm": 0.6838778853416443, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 0.2619013581129378, + "grad_norm": 1.5998408794403076, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0591, + "step": 9160 + }, + { + "epoch": 0.26218727662616154, + "grad_norm": 0.8548596501350403, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.04, + "step": 9170 + }, + { + "epoch": 0.2624731951393853, + "grad_norm": 0.5784913897514343, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0464, + "step": 9180 + }, + { + "epoch": 0.262759113652609, + "grad_norm": 1.490502953529358, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0672, + "step": 9190 + }, + { + "epoch": 0.26304503216583275, + "grad_norm": 0.8950793743133545, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0532, + "step": 9200 + }, + { + "epoch": 0.2633309506790565, + "grad_norm": 0.5513611435890198, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 0.2636168691922802, + "grad_norm": 1.0512864589691162, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0487, + "step": 9220 + }, + { + "epoch": 0.2639027877055039, + "grad_norm": 0.48180028796195984, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0543, + "step": 9230 + }, + { + "epoch": 0.26418870621872764, + "grad_norm": 0.5451590418815613, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0553, + "step": 9240 + }, + { + "epoch": 0.2644746247319514, + "grad_norm": 0.6986148953437805, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0523, + "step": 9250 + }, + { + "epoch": 0.2647605432451751, + "grad_norm": 0.5977929830551147, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0446, + "step": 9260 + }, + { + "epoch": 0.26504646175839885, + "grad_norm": 0.6042361855506897, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0716, + "step": 9270 + }, + { + "epoch": 0.2653323802716226, + "grad_norm": 0.473418265581131, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0378, + "step": 9280 + }, + { + "epoch": 0.26561829878484633, + "grad_norm": 0.9332809448242188, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0484, + "step": 9290 + }, + { + "epoch": 0.26590421729807007, + "grad_norm": 0.5209246277809143, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0439, + "step": 9300 + }, + { + "epoch": 0.2661901358112938, + "grad_norm": 0.5742560625076294, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0468, + "step": 9310 + }, + { + "epoch": 0.2664760543245175, + "grad_norm": 0.585503876209259, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0507, + "step": 9320 + }, + { + "epoch": 0.2667619728377412, + "grad_norm": 0.5254957675933838, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0436, + "step": 9330 + }, + { + "epoch": 0.26704789135096496, + "grad_norm": 0.48314452171325684, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0417, + "step": 9340 + }, + { + "epoch": 0.2673338098641887, + "grad_norm": 0.630020022392273, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0425, + "step": 9350 + }, + { + "epoch": 0.26761972837741244, + "grad_norm": 0.3545299470424652, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0338, + "step": 9360 + }, + { + "epoch": 0.2679056468906362, + "grad_norm": 0.6934211850166321, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0445, + "step": 9370 + }, + { + "epoch": 0.2681915654038599, + "grad_norm": 0.6544952392578125, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0451, + "step": 9380 + }, + { + "epoch": 0.26847748391708365, + "grad_norm": 0.4581946134567261, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0422, + "step": 9390 + }, + { + "epoch": 0.2687634024303074, + "grad_norm": 0.6338506937026978, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0576, + "step": 9400 + }, + { + "epoch": 0.26904932094353107, + "grad_norm": 0.8165014386177063, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0474, + "step": 9410 + }, + { + "epoch": 0.2693352394567548, + "grad_norm": 0.793222188949585, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0546, + "step": 9420 + }, + { + "epoch": 0.26962115796997854, + "grad_norm": 0.3669852316379547, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0461, + "step": 9430 + }, + { + "epoch": 0.2699070764832023, + "grad_norm": 0.7339810729026794, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0433, + "step": 9440 + }, + { + "epoch": 0.270192994996426, + "grad_norm": 0.4948982298374176, + "learning_rate": 1.648606940465527e-05, + "loss": 0.048, + "step": 9450 + }, + { + "epoch": 0.27047891350964975, + "grad_norm": 0.4681016206741333, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0437, + "step": 9460 + }, + { + "epoch": 0.2707648320228735, + "grad_norm": 0.5091472864151001, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0576, + "step": 9470 + }, + { + "epoch": 0.27105075053609723, + "grad_norm": 0.5683515071868896, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0503, + "step": 9480 + }, + { + "epoch": 0.27133666904932097, + "grad_norm": 0.626844048500061, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0495, + "step": 9490 + }, + { + "epoch": 0.27162258756254465, + "grad_norm": 0.6757943034172058, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0495, + "step": 9500 + }, + { + "epoch": 0.2719085060757684, + "grad_norm": 0.7049196362495422, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0579, + "step": 9510 + }, + { + "epoch": 0.2721944245889921, + "grad_norm": 0.6469181776046753, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.051, + "step": 9520 + }, + { + "epoch": 0.27248034310221586, + "grad_norm": 0.5414942502975464, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0433, + "step": 9530 + }, + { + "epoch": 0.2727662616154396, + "grad_norm": 0.5642798542976379, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0495, + "step": 9540 + }, + { + "epoch": 0.27305218012866334, + "grad_norm": 1.0527595281600952, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0445, + "step": 9550 + }, + { + "epoch": 0.2733380986418871, + "grad_norm": 0.8501784801483154, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0627, + "step": 9560 + }, + { + "epoch": 0.2736240171551108, + "grad_norm": 0.7892033457756042, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 0.27390993566833455, + "grad_norm": 0.3588624596595764, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0512, + "step": 9580 + }, + { + "epoch": 0.27419585418155823, + "grad_norm": 0.7474772930145264, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.6217718124389648, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0435, + "step": 9600 + }, + { + "epoch": 0.2747676912080057, + "grad_norm": 0.7711623907089233, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.054, + "step": 9610 + }, + { + "epoch": 0.27505360972122944, + "grad_norm": 0.8171371221542358, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0371, + "step": 9620 + }, + { + "epoch": 0.2753395282344532, + "grad_norm": 0.8668338060379028, + "learning_rate": 1.634591312387623e-05, + "loss": 0.055, + "step": 9630 + }, + { + "epoch": 0.2756254467476769, + "grad_norm": 0.5683940052986145, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0478, + "step": 9640 + }, + { + "epoch": 0.27591136526090065, + "grad_norm": 0.44098007678985596, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0531, + "step": 9650 + }, + { + "epoch": 0.2761972837741244, + "grad_norm": 0.8305087685585022, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0462, + "step": 9660 + }, + { + "epoch": 0.27648320228734813, + "grad_norm": 0.9088799953460693, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0489, + "step": 9670 + }, + { + "epoch": 0.2767691208005718, + "grad_norm": 0.5590132474899292, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0486, + "step": 9680 + }, + { + "epoch": 0.27705503931379555, + "grad_norm": 0.776713490486145, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0443, + "step": 9690 + }, + { + "epoch": 0.2773409578270193, + "grad_norm": 0.6107578873634338, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0461, + "step": 9700 + }, + { + "epoch": 0.277626876340243, + "grad_norm": 0.4635901153087616, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0397, + "step": 9710 + }, + { + "epoch": 0.27791279485346676, + "grad_norm": 0.4220955967903137, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0463, + "step": 9720 + }, + { + "epoch": 0.2781987133666905, + "grad_norm": 0.4947739243507385, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0397, + "step": 9730 + }, + { + "epoch": 0.27848463187991424, + "grad_norm": 0.5589033961296082, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0426, + "step": 9740 + }, + { + "epoch": 0.278770550393138, + "grad_norm": 0.4904254972934723, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0458, + "step": 9750 + }, + { + "epoch": 0.2790564689063617, + "grad_norm": 0.34956127405166626, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.2793423874195854, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0424, + "step": 9770 + }, + { + "epoch": 0.27962830593280913, + "grad_norm": 0.48727869987487793, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0451, + "step": 9780 + }, + { + "epoch": 0.27991422444603287, + "grad_norm": 0.7314761281013489, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0523, + "step": 9790 + }, + { + "epoch": 0.2802001429592566, + "grad_norm": 0.5017405152320862, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0423, + "step": 9800 + }, + { + "epoch": 0.28048606147248034, + "grad_norm": 0.8375383615493774, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0435, + "step": 9810 + }, + { + "epoch": 0.2807719799857041, + "grad_norm": 0.8702818155288696, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0487, + "step": 9820 + }, + { + "epoch": 0.2810578984989278, + "grad_norm": 0.4649866223335266, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0483, + "step": 9830 + }, + { + "epoch": 0.28134381701215155, + "grad_norm": 0.7464607357978821, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0463, + "step": 9840 + }, + { + "epoch": 0.2816297355253753, + "grad_norm": 0.48055607080459595, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0418, + "step": 9850 + }, + { + "epoch": 0.281915654038599, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0433, + "step": 9860 + }, + { + "epoch": 0.2822015725518227, + "grad_norm": 0.8859265446662903, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0605, + "step": 9870 + }, + { + "epoch": 0.28248749106504645, + "grad_norm": 0.8236640691757202, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0441, + "step": 9880 + }, + { + "epoch": 0.2827734095782702, + "grad_norm": 0.6617199778556824, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0515, + "step": 9890 + }, + { + "epoch": 0.2830593280914939, + "grad_norm": 0.8017821907997131, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0503, + "step": 9900 + }, + { + "epoch": 0.28334524660471766, + "grad_norm": 1.070827603340149, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0485, + "step": 9910 + }, + { + "epoch": 0.2836311651179414, + "grad_norm": 1.021888256072998, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0479, + "step": 9920 + }, + { + "epoch": 0.28391708363116513, + "grad_norm": 0.34402501583099365, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0391, + "step": 9930 + }, + { + "epoch": 0.28420300214438887, + "grad_norm": 0.58541339635849, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0461, + "step": 9940 + }, + { + "epoch": 0.28448892065761255, + "grad_norm": 0.8062207102775574, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0553, + "step": 9950 + }, + { + "epoch": 0.2847748391708363, + "grad_norm": 0.6435661315917969, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0536, + "step": 9960 + }, + { + "epoch": 0.28506075768406003, + "grad_norm": 0.5670832395553589, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0405, + "step": 9970 + }, + { + "epoch": 0.28534667619728377, + "grad_norm": 0.45282548666000366, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0458, + "step": 9980 + }, + { + "epoch": 0.2856325947105075, + "grad_norm": 0.42272916436195374, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0392, + "step": 9990 + }, + { + "epoch": 0.28591851322373124, + "grad_norm": 0.5791928768157959, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0453, + "step": 10000 + }, + { + "epoch": 0.286204431736955, + "grad_norm": 0.9841408729553223, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.052, + "step": 10010 + }, + { + "epoch": 0.2864903502501787, + "grad_norm": 0.8658338785171509, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0461, + "step": 10020 + }, + { + "epoch": 0.28677626876340245, + "grad_norm": 0.624788224697113, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0416, + "step": 10030 + }, + { + "epoch": 0.28706218727662614, + "grad_norm": 0.6108028888702393, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0548, + "step": 10040 + }, + { + "epoch": 0.2873481057898499, + "grad_norm": 0.7907708883285522, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0406, + "step": 10050 + }, + { + "epoch": 0.2876340243030736, + "grad_norm": 0.7695413827896118, + "learning_rate": 1.60029690609047e-05, + "loss": 0.061, + "step": 10060 + }, + { + "epoch": 0.28791994281629735, + "grad_norm": 0.4407683312892914, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0483, + "step": 10070 + }, + { + "epoch": 0.2882058613295211, + "grad_norm": 0.6242743730545044, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.039, + "step": 10080 + }, + { + "epoch": 0.2884917798427448, + "grad_norm": 0.8752113580703735, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0433, + "step": 10090 + }, + { + "epoch": 0.28877769835596856, + "grad_norm": 0.8834511041641235, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0348, + "step": 10100 + }, + { + "epoch": 0.2890636168691923, + "grad_norm": 1.0036063194274902, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0593, + "step": 10110 + }, + { + "epoch": 0.28934953538241603, + "grad_norm": 0.5511205196380615, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0459, + "step": 10120 + }, + { + "epoch": 0.2896354538956397, + "grad_norm": 0.7717337012290955, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0381, + "step": 10130 + }, + { + "epoch": 0.28992137240886345, + "grad_norm": 1.123363971710205, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0462, + "step": 10140 + }, + { + "epoch": 0.2902072909220872, + "grad_norm": 0.6212007403373718, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0446, + "step": 10150 + }, + { + "epoch": 0.29049320943531093, + "grad_norm": 0.5547964572906494, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0362, + "step": 10160 + }, + { + "epoch": 0.29077912794853467, + "grad_norm": 0.593225359916687, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0462, + "step": 10170 + }, + { + "epoch": 0.2910650464617584, + "grad_norm": 0.5569560527801514, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0508, + "step": 10180 + }, + { + "epoch": 0.29135096497498214, + "grad_norm": 0.5464656949043274, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0399, + "step": 10190 + }, + { + "epoch": 0.2916368834882059, + "grad_norm": 1.2456778287887573, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0494, + "step": 10200 + }, + { + "epoch": 0.2919228020014296, + "grad_norm": 0.7862445712089539, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0551, + "step": 10210 + }, + { + "epoch": 0.2922087205146533, + "grad_norm": 0.745941698551178, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0469, + "step": 10220 + }, + { + "epoch": 0.29249463902787703, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0492, + "step": 10230 + }, + { + "epoch": 0.29278055754110077, + "grad_norm": 0.659205973148346, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0453, + "step": 10240 + }, + { + "epoch": 0.2930664760543245, + "grad_norm": 0.6925905346870422, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0463, + "step": 10250 + }, + { + "epoch": 0.29335239456754825, + "grad_norm": 0.479115754365921, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0395, + "step": 10260 + }, + { + "epoch": 0.293638313080772, + "grad_norm": 0.5085121393203735, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0504, + "step": 10270 + }, + { + "epoch": 0.2939242315939957, + "grad_norm": 0.46833914518356323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0411, + "step": 10280 + }, + { + "epoch": 0.29421015010721946, + "grad_norm": 0.4534672796726227, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0491, + "step": 10290 + }, + { + "epoch": 0.2944960686204432, + "grad_norm": 0.5704737305641174, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0391, + "step": 10300 + }, + { + "epoch": 0.2947819871336669, + "grad_norm": 1.0342676639556885, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0681, + "step": 10310 + }, + { + "epoch": 0.2950679056468906, + "grad_norm": 0.5002169013023376, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0429, + "step": 10320 + }, + { + "epoch": 0.29535382416011435, + "grad_norm": 0.5565863847732544, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0575, + "step": 10330 + }, + { + "epoch": 0.2956397426733381, + "grad_norm": 0.7826551198959351, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0448, + "step": 10340 + }, + { + "epoch": 0.29592566118656183, + "grad_norm": 0.7019012570381165, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0436, + "step": 10350 + }, + { + "epoch": 0.29621157969978557, + "grad_norm": 0.8324534893035889, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0503, + "step": 10360 + }, + { + "epoch": 0.2964974982130093, + "grad_norm": 0.7064073085784912, + "learning_rate": 1.574895332125391e-05, + "loss": 0.041, + "step": 10370 + }, + { + "epoch": 0.29678341672623304, + "grad_norm": 0.5634047389030457, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0474, + "step": 10380 + }, + { + "epoch": 0.2970693352394568, + "grad_norm": 0.8504926562309265, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0502, + "step": 10390 + }, + { + "epoch": 0.29735525375268046, + "grad_norm": 0.508313775062561, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0368, + "step": 10400 + }, + { + "epoch": 0.2976411722659042, + "grad_norm": 0.5851112008094788, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0472, + "step": 10410 + }, + { + "epoch": 0.29792709077912793, + "grad_norm": 0.5689557790756226, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0479, + "step": 10420 + }, + { + "epoch": 0.29821300929235167, + "grad_norm": 0.5026743412017822, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0406, + "step": 10430 + }, + { + "epoch": 0.2984989278055754, + "grad_norm": 0.5662751197814941, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0441, + "step": 10440 + }, + { + "epoch": 0.29878484631879915, + "grad_norm": 0.899709939956665, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.2990707648320229, + "grad_norm": 0.4681940972805023, + "learning_rate": 1.567419089313346e-05, + "loss": 0.054, + "step": 10460 + }, + { + "epoch": 0.2993566833452466, + "grad_norm": 0.39646071195602417, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0375, + "step": 10470 + }, + { + "epoch": 0.29964260185847036, + "grad_norm": 1.204815149307251, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0487, + "step": 10480 + }, + { + "epoch": 0.29992852037169404, + "grad_norm": 0.4507630467414856, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0516, + "step": 10490 + }, + { + "epoch": 0.3002144388849178, + "grad_norm": 0.9783321022987366, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0642, + "step": 10500 + }, + { + "epoch": 0.3005003573981415, + "grad_norm": 0.5406969785690308, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0447, + "step": 10510 + }, + { + "epoch": 0.30078627591136525, + "grad_norm": 0.44153860211372375, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0449, + "step": 10520 + }, + { + "epoch": 0.301072194424589, + "grad_norm": 0.5723687410354614, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0548, + "step": 10530 + }, + { + "epoch": 0.3013581129378127, + "grad_norm": 0.4453120529651642, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0434, + "step": 10540 + }, + { + "epoch": 0.30164403145103647, + "grad_norm": 0.34224697947502136, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0385, + "step": 10550 + }, + { + "epoch": 0.3019299499642602, + "grad_norm": 0.6389157176017761, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0569, + "step": 10560 + }, + { + "epoch": 0.30221586847748394, + "grad_norm": 0.5845953822135925, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0467, + "step": 10570 + }, + { + "epoch": 0.3025017869907076, + "grad_norm": 0.6581900119781494, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0422, + "step": 10580 + }, + { + "epoch": 0.30278770550393136, + "grad_norm": 0.4964161813259125, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0428, + "step": 10590 + }, + { + "epoch": 0.3030736240171551, + "grad_norm": 0.635380208492279, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0442, + "step": 10600 + }, + { + "epoch": 0.30335954253037883, + "grad_norm": 0.9795969128608704, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0517, + "step": 10610 + }, + { + "epoch": 0.30364546104360257, + "grad_norm": 0.9987231492996216, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0514, + "step": 10620 + }, + { + "epoch": 0.3039313795568263, + "grad_norm": 0.6384946703910828, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0471, + "step": 10630 + }, + { + "epoch": 0.30421729807005005, + "grad_norm": 0.49352115392684937, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0351, + "step": 10640 + }, + { + "epoch": 0.3045032165832738, + "grad_norm": 0.45028480887413025, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0438, + "step": 10650 + }, + { + "epoch": 0.3047891350964975, + "grad_norm": 0.5717794895172119, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0491, + "step": 10660 + }, + { + "epoch": 0.3050750536097212, + "grad_norm": 0.5436326265335083, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0407, + "step": 10670 + }, + { + "epoch": 0.30536097212294494, + "grad_norm": 0.7777692675590515, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0436, + "step": 10680 + }, + { + "epoch": 0.3056468906361687, + "grad_norm": 0.6597929000854492, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0407, + "step": 10690 + }, + { + "epoch": 0.3059328091493924, + "grad_norm": 0.6059311032295227, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0481, + "step": 10700 + }, + { + "epoch": 0.30621872766261615, + "grad_norm": 0.5530681014060974, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0418, + "step": 10710 + }, + { + "epoch": 0.3065046461758399, + "grad_norm": 0.5778716802597046, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0429, + "step": 10720 + }, + { + "epoch": 0.3067905646890636, + "grad_norm": 0.4573792517185211, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0586, + "step": 10730 + }, + { + "epoch": 0.30707648320228736, + "grad_norm": 0.8193615078926086, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0474, + "step": 10740 + }, + { + "epoch": 0.3073624017155111, + "grad_norm": 0.9410123229026794, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.3076483202287348, + "grad_norm": 0.8244432806968689, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0462, + "step": 10760 + }, + { + "epoch": 0.3079342387419585, + "grad_norm": 0.644899845123291, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0479, + "step": 10770 + }, + { + "epoch": 0.30822015725518226, + "grad_norm": 0.28044867515563965, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.04, + "step": 10780 + }, + { + "epoch": 0.308506075768406, + "grad_norm": 0.6538394093513489, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0406, + "step": 10790 + }, + { + "epoch": 0.30879199428162973, + "grad_norm": 0.9572822451591492, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0505, + "step": 10800 + }, + { + "epoch": 0.30907791279485347, + "grad_norm": 0.539826512336731, + "learning_rate": 1.537928999540189e-05, + "loss": 0.05, + "step": 10810 + }, + { + "epoch": 0.3093638313080772, + "grad_norm": 0.801988959312439, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0454, + "step": 10820 + }, + { + "epoch": 0.30964974982130095, + "grad_norm": 0.57478928565979, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.039, + "step": 10830 + }, + { + "epoch": 0.3099356683345247, + "grad_norm": 0.6313017010688782, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0384, + "step": 10840 + }, + { + "epoch": 0.31022158684774837, + "grad_norm": 0.507997989654541, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0365, + "step": 10850 + }, + { + "epoch": 0.3105075053609721, + "grad_norm": 0.5152313709259033, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0487, + "step": 10860 + }, + { + "epoch": 0.31079342387419584, + "grad_norm": 0.6123478412628174, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0405, + "step": 10870 + }, + { + "epoch": 0.3110793423874196, + "grad_norm": 1.079551100730896, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0443, + "step": 10880 + }, + { + "epoch": 0.3113652609006433, + "grad_norm": 0.39866960048675537, + "learning_rate": 1.531098472380285e-05, + "loss": 0.04, + "step": 10890 + }, + { + "epoch": 0.31165117941386705, + "grad_norm": 0.3715427815914154, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0387, + "step": 10900 + }, + { + "epoch": 0.3119370979270908, + "grad_norm": 0.7201068997383118, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.054, + "step": 10910 + }, + { + "epoch": 0.3122230164403145, + "grad_norm": 0.9512631893157959, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0383, + "step": 10920 + }, + { + "epoch": 0.31250893495353826, + "grad_norm": 0.5948206186294556, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0472, + "step": 10930 + }, + { + "epoch": 0.31279485346676195, + "grad_norm": 0.7174249291419983, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0437, + "step": 10940 + }, + { + "epoch": 0.3130807719799857, + "grad_norm": 0.6190982460975647, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0383, + "step": 10950 + }, + { + "epoch": 0.3133666904932094, + "grad_norm": 0.7733815312385559, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0327, + "step": 10960 + }, + { + "epoch": 0.31365260900643316, + "grad_norm": 1.2995271682739258, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0427, + "step": 10970 + }, + { + "epoch": 0.3139385275196569, + "grad_norm": 1.1102336645126343, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.04, + "step": 10980 + }, + { + "epoch": 0.31422444603288063, + "grad_norm": 0.7618277668952942, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0488, + "step": 10990 + }, + { + "epoch": 0.31451036454610437, + "grad_norm": 0.5355142951011658, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0436, + "step": 11000 + }, + { + "epoch": 0.3147962830593281, + "grad_norm": 1.3410072326660156, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0463, + "step": 11010 + }, + { + "epoch": 0.31508220157255185, + "grad_norm": 0.7810450196266174, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0493, + "step": 11020 + }, + { + "epoch": 0.3153681200857755, + "grad_norm": 0.6452206373214722, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0354, + "step": 11030 + }, + { + "epoch": 0.31565403859899926, + "grad_norm": 1.037593126296997, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0418, + "step": 11040 + }, + { + "epoch": 0.315939957112223, + "grad_norm": 0.7032834887504578, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0431, + "step": 11050 + }, + { + "epoch": 0.31622587562544674, + "grad_norm": 0.5168939232826233, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0472, + "step": 11060 + }, + { + "epoch": 0.3165117941386705, + "grad_norm": 0.5239925384521484, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0438, + "step": 11070 + }, + { + "epoch": 0.3167977126518942, + "grad_norm": 0.8209654688835144, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0506, + "step": 11080 + }, + { + "epoch": 0.31708363116511795, + "grad_norm": 0.5318232178688049, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0516, + "step": 11090 + }, + { + "epoch": 0.3173695496783417, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0482, + "step": 11100 + }, + { + "epoch": 0.3176554681915654, + "grad_norm": 0.6691215634346008, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.046, + "step": 11110 + }, + { + "epoch": 0.3179413867047891, + "grad_norm": 0.4862753450870514, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 0.31822730521801285, + "grad_norm": 0.4640316963195801, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0433, + "step": 11130 + }, + { + "epoch": 0.3185132237312366, + "grad_norm": 0.7841521501541138, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0445, + "step": 11140 + }, + { + "epoch": 0.3187991422444603, + "grad_norm": 0.6809426546096802, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0518, + "step": 11150 + }, + { + "epoch": 0.31908506075768406, + "grad_norm": 0.6195946931838989, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0569, + "step": 11160 + }, + { + "epoch": 0.3193709792709078, + "grad_norm": 0.7289860248565674, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0487, + "step": 11170 + }, + { + "epoch": 0.31965689778413153, + "grad_norm": 0.5575736165046692, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0409, + "step": 11180 + }, + { + "epoch": 0.31994281629735527, + "grad_norm": 0.8619267344474792, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0424, + "step": 11190 + }, + { + "epoch": 0.320228734810579, + "grad_norm": 0.740242063999176, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0474, + "step": 11200 + }, + { + "epoch": 0.3205146533238027, + "grad_norm": 0.4169894754886627, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0395, + "step": 11210 + }, + { + "epoch": 0.3208005718370264, + "grad_norm": 0.5773794651031494, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0414, + "step": 11220 + }, + { + "epoch": 0.32108649035025016, + "grad_norm": 0.4941500723361969, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0484, + "step": 11230 + }, + { + "epoch": 0.3213724088634739, + "grad_norm": 0.7985579371452332, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.051, + "step": 11240 + }, + { + "epoch": 0.32165832737669764, + "grad_norm": 0.5262066721916199, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0434, + "step": 11250 + }, + { + "epoch": 0.3219442458899214, + "grad_norm": 0.4074312150478363, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0428, + "step": 11260 + }, + { + "epoch": 0.3222301644031451, + "grad_norm": 1.0757715702056885, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0468, + "step": 11270 + }, + { + "epoch": 0.32251608291636885, + "grad_norm": 0.7281575202941895, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0386, + "step": 11280 + }, + { + "epoch": 0.3228020014295926, + "grad_norm": 0.35078516602516174, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0413, + "step": 11290 + }, + { + "epoch": 0.32308791994281627, + "grad_norm": 0.5642452836036682, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0435, + "step": 11300 + }, + { + "epoch": 0.32337383845604, + "grad_norm": 0.5326974987983704, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0459, + "step": 11310 + }, + { + "epoch": 0.32365975696926375, + "grad_norm": 0.6212049126625061, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0451, + "step": 11320 + }, + { + "epoch": 0.3239456754824875, + "grad_norm": 0.4887222349643707, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0445, + "step": 11330 + }, + { + "epoch": 0.3242315939957112, + "grad_norm": 0.6692403554916382, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0423, + "step": 11340 + }, + { + "epoch": 0.32451751250893496, + "grad_norm": 0.7166061997413635, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0445, + "step": 11350 + }, + { + "epoch": 0.3248034310221587, + "grad_norm": 0.5342463850975037, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0394, + "step": 11360 + }, + { + "epoch": 0.32508934953538243, + "grad_norm": 1.0617904663085938, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0401, + "step": 11370 + }, + { + "epoch": 0.32537526804860617, + "grad_norm": 0.9869458675384521, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0508, + "step": 11380 + }, + { + "epoch": 0.32566118656182985, + "grad_norm": 0.32021698355674744, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0346, + "step": 11390 + }, + { + "epoch": 0.3259471050750536, + "grad_norm": 0.6566154360771179, + "learning_rate": 1.486814531655139e-05, + "loss": 0.046, + "step": 11400 + }, + { + "epoch": 0.3262330235882773, + "grad_norm": 0.6716777086257935, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.045, + "step": 11410 + }, + { + "epoch": 0.32651894210150106, + "grad_norm": 0.7489042282104492, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0443, + "step": 11420 + }, + { + "epoch": 0.3268048606147248, + "grad_norm": 0.6040313243865967, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0418, + "step": 11430 + }, + { + "epoch": 0.32709077912794854, + "grad_norm": 0.4891999363899231, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0342, + "step": 11440 + }, + { + "epoch": 0.3273766976411723, + "grad_norm": 0.4264339506626129, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0414, + "step": 11450 + }, + { + "epoch": 0.327662616154396, + "grad_norm": 0.5535606741905212, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0362, + "step": 11460 + }, + { + "epoch": 0.32794853466761975, + "grad_norm": 0.566705048084259, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0472, + "step": 11470 + }, + { + "epoch": 0.32823445318084343, + "grad_norm": 0.8539089560508728, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0478, + "step": 11480 + }, + { + "epoch": 0.32852037169406717, + "grad_norm": 0.3981179893016815, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0429, + "step": 11490 + }, + { + "epoch": 0.3288062902072909, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0487, + "step": 11500 + }, + { + "epoch": 0.32909220872051465, + "grad_norm": 0.45551198720932007, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0384, + "step": 11510 + }, + { + "epoch": 0.3293781272337384, + "grad_norm": 0.6321517825126648, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0541, + "step": 11520 + }, + { + "epoch": 0.3296640457469621, + "grad_norm": 0.7971932888031006, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0445, + "step": 11530 + }, + { + "epoch": 0.32994996426018586, + "grad_norm": 0.5022657513618469, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0414, + "step": 11540 + }, + { + "epoch": 0.3302358827734096, + "grad_norm": 0.7302954196929932, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.044, + "step": 11550 + }, + { + "epoch": 0.33052180128663333, + "grad_norm": 0.5123834013938904, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0451, + "step": 11560 + }, + { + "epoch": 0.330807719799857, + "grad_norm": 0.5261625647544861, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0416, + "step": 11570 + }, + { + "epoch": 0.33109363831308075, + "grad_norm": 0.5782840251922607, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0419, + "step": 11580 + }, + { + "epoch": 0.3313795568263045, + "grad_norm": 0.9754800796508789, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0403, + "step": 11590 + }, + { + "epoch": 0.3316654753395282, + "grad_norm": 0.48157551884651184, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0459, + "step": 11600 + }, + { + "epoch": 0.33195139385275196, + "grad_norm": 0.4394964277744293, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0461, + "step": 11610 + }, + { + "epoch": 0.3322373123659757, + "grad_norm": 1.220790147781372, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0448, + "step": 11620 + }, + { + "epoch": 0.33252323087919944, + "grad_norm": 0.6908231973648071, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0431, + "step": 11630 + }, + { + "epoch": 0.3328091493924232, + "grad_norm": 0.45382779836654663, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0379, + "step": 11640 + }, + { + "epoch": 0.3330950679056469, + "grad_norm": 0.5963619947433472, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0465, + "step": 11650 + }, + { + "epoch": 0.3333809864188706, + "grad_norm": 0.676210880279541, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0411, + "step": 11660 + }, + { + "epoch": 0.33366690493209433, + "grad_norm": 0.893473744392395, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0443, + "step": 11670 + }, + { + "epoch": 0.33395282344531807, + "grad_norm": 0.30655553936958313, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.04, + "step": 11680 + }, + { + "epoch": 0.3342387419585418, + "grad_norm": 0.899615466594696, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0462, + "step": 11690 + }, + { + "epoch": 0.33452466047176554, + "grad_norm": 0.5037568807601929, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0394, + "step": 11700 + }, + { + "epoch": 0.3348105789849893, + "grad_norm": 0.573716402053833, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0426, + "step": 11710 + }, + { + "epoch": 0.335096497498213, + "grad_norm": 0.4985221326351166, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0422, + "step": 11720 + }, + { + "epoch": 0.33538241601143676, + "grad_norm": 0.8864797353744507, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0504, + "step": 11730 + }, + { + "epoch": 0.3356683345246605, + "grad_norm": 0.49209004640579224, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0409, + "step": 11740 + }, + { + "epoch": 0.3359542530378842, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0468, + "step": 11750 + }, + { + "epoch": 0.3362401715511079, + "grad_norm": 0.7552497386932373, + "learning_rate": 1.454836451908656e-05, + "loss": 0.041, + "step": 11760 + }, + { + "epoch": 0.33652609006433165, + "grad_norm": 0.5737242102622986, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0503, + "step": 11770 + }, + { + "epoch": 0.3368120085775554, + "grad_norm": 0.46150341629981995, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0399, + "step": 11780 + }, + { + "epoch": 0.3370979270907791, + "grad_norm": 0.55389803647995, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0442, + "step": 11790 + }, + { + "epoch": 0.33738384560400286, + "grad_norm": 0.7647727727890015, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0472, + "step": 11800 + }, + { + "epoch": 0.3376697641172266, + "grad_norm": 0.8755397200584412, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0444, + "step": 11810 + }, + { + "epoch": 0.33795568263045034, + "grad_norm": 0.9257917404174805, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0416, + "step": 11820 + }, + { + "epoch": 0.3382416011436741, + "grad_norm": 0.4048840403556824, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0418, + "step": 11830 + }, + { + "epoch": 0.33852751965689776, + "grad_norm": 0.584200382232666, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0436, + "step": 11840 + }, + { + "epoch": 0.3388134381701215, + "grad_norm": 0.7565616369247437, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0407, + "step": 11850 + }, + { + "epoch": 0.33909935668334523, + "grad_norm": 0.8025793433189392, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0424, + "step": 11860 + }, + { + "epoch": 0.33938527519656897, + "grad_norm": 0.3123756945133209, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.044, + "step": 11870 + }, + { + "epoch": 0.3396711937097927, + "grad_norm": 0.8047941327095032, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0471, + "step": 11880 + }, + { + "epoch": 0.33995711222301644, + "grad_norm": 0.8675779104232788, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0443, + "step": 11890 + }, + { + "epoch": 0.3402430307362402, + "grad_norm": 0.47229406237602234, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0416, + "step": 11900 + }, + { + "epoch": 0.3405289492494639, + "grad_norm": 0.3775595426559448, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0512, + "step": 11910 + }, + { + "epoch": 0.34081486776268766, + "grad_norm": 0.6179372668266296, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0395, + "step": 11920 + }, + { + "epoch": 0.34110078627591134, + "grad_norm": 0.47618359327316284, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0407, + "step": 11930 + }, + { + "epoch": 0.3413867047891351, + "grad_norm": 0.5495609641075134, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.041, + "step": 11940 + }, + { + "epoch": 0.3416726233023588, + "grad_norm": 0.7276089191436768, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0445, + "step": 11950 + }, + { + "epoch": 0.34195854181558255, + "grad_norm": 0.9464111328125, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0471, + "step": 11960 + }, + { + "epoch": 0.3422444603288063, + "grad_norm": 0.8340250253677368, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0488, + "step": 11970 + }, + { + "epoch": 0.34253037884203, + "grad_norm": 0.6392719149589539, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0407, + "step": 11980 + }, + { + "epoch": 0.34281629735525376, + "grad_norm": 0.7563493251800537, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0388, + "step": 11990 + }, + { + "epoch": 0.3431022158684775, + "grad_norm": 0.7145271301269531, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.042, + "step": 12000 + }, + { + "epoch": 0.34338813438170124, + "grad_norm": 0.6522033214569092, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0507, + "step": 12010 + }, + { + "epoch": 0.3436740528949249, + "grad_norm": 0.4634755849838257, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0388, + "step": 12020 + }, + { + "epoch": 0.34395997140814866, + "grad_norm": 0.6681762337684631, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0498, + "step": 12030 + }, + { + "epoch": 0.3442458899213724, + "grad_norm": 0.5068351626396179, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0484, + "step": 12040 + }, + { + "epoch": 0.34453180843459613, + "grad_norm": 0.5424943566322327, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0406, + "step": 12050 + }, + { + "epoch": 0.34481772694781987, + "grad_norm": 0.674436628818512, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.04, + "step": 12060 + }, + { + "epoch": 0.3451036454610436, + "grad_norm": 0.8140727281570435, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0417, + "step": 12070 + }, + { + "epoch": 0.34538956397426734, + "grad_norm": 0.6394575238227844, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0413, + "step": 12080 + }, + { + "epoch": 0.3456754824874911, + "grad_norm": 0.5134334564208984, + "learning_rate": 1.425047976058418e-05, + "loss": 0.04, + "step": 12090 + }, + { + "epoch": 0.3459614010007148, + "grad_norm": 0.6670883297920227, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0397, + "step": 12100 + }, + { + "epoch": 0.3462473195139385, + "grad_norm": 0.49804338812828064, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0431, + "step": 12110 + }, + { + "epoch": 0.34653323802716224, + "grad_norm": 0.33912673592567444, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0492, + "step": 12120 + }, + { + "epoch": 0.346819156540386, + "grad_norm": 0.45478618144989014, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0427, + "step": 12130 + }, + { + "epoch": 0.3471050750536097, + "grad_norm": 0.6690845489501953, + "learning_rate": 1.420497389129506e-05, + "loss": 0.044, + "step": 12140 + }, + { + "epoch": 0.34739099356683345, + "grad_norm": 0.9296556115150452, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.042, + "step": 12150 + }, + { + "epoch": 0.3476769120800572, + "grad_norm": 0.4859760105609894, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0386, + "step": 12160 + }, + { + "epoch": 0.3479628305932809, + "grad_norm": 1.0067541599273682, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0495, + "step": 12170 + }, + { + "epoch": 0.34824874910650466, + "grad_norm": 0.7799471616744995, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0614, + "step": 12180 + }, + { + "epoch": 0.3485346676197284, + "grad_norm": 0.48603832721710205, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0422, + "step": 12190 + }, + { + "epoch": 0.3488205861329521, + "grad_norm": 1.2030225992202759, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0535, + "step": 12200 + }, + { + "epoch": 0.3491065046461758, + "grad_norm": 0.5523782968521118, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0437, + "step": 12210 + }, + { + "epoch": 0.34939242315939956, + "grad_norm": 0.9041968584060669, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0441, + "step": 12220 + }, + { + "epoch": 0.3496783416726233, + "grad_norm": 0.5859020948410034, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0451, + "step": 12230 + }, + { + "epoch": 0.34996426018584703, + "grad_norm": 0.8736525177955627, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0439, + "step": 12240 + }, + { + "epoch": 0.35025017869907077, + "grad_norm": 0.4692678153514862, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0516, + "step": 12250 + }, + { + "epoch": 0.3505360972122945, + "grad_norm": 0.6326560974121094, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0427, + "step": 12260 + }, + { + "epoch": 0.35082201572551824, + "grad_norm": 0.6265914440155029, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0392, + "step": 12270 + }, + { + "epoch": 0.351107934238742, + "grad_norm": 0.8684681057929993, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0416, + "step": 12280 + }, + { + "epoch": 0.35139385275196566, + "grad_norm": 0.6076116561889648, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0405, + "step": 12290 + }, + { + "epoch": 0.3516797712651894, + "grad_norm": 0.36192813515663147, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0417, + "step": 12300 + }, + { + "epoch": 0.35196568977841314, + "grad_norm": 0.5561486482620239, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0397, + "step": 12310 + }, + { + "epoch": 0.3522516082916369, + "grad_norm": 0.5955346822738647, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0332, + "step": 12320 + }, + { + "epoch": 0.3525375268048606, + "grad_norm": 0.4861294627189636, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0423, + "step": 12330 + }, + { + "epoch": 0.35282344531808435, + "grad_norm": 0.920704185962677, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0467, + "step": 12340 + }, + { + "epoch": 0.3531093638313081, + "grad_norm": 0.4749159514904022, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0425, + "step": 12350 + }, + { + "epoch": 0.3533952823445318, + "grad_norm": 0.5075432658195496, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0362, + "step": 12360 + }, + { + "epoch": 0.35368120085775556, + "grad_norm": 0.3057022988796234, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0378, + "step": 12370 + }, + { + "epoch": 0.35396711937097924, + "grad_norm": 0.48122167587280273, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0359, + "step": 12380 + }, + { + "epoch": 0.354253037884203, + "grad_norm": 0.39227673411369324, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0432, + "step": 12390 + }, + { + "epoch": 0.3545389563974267, + "grad_norm": 0.641839861869812, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0422, + "step": 12400 + }, + { + "epoch": 0.35482487491065046, + "grad_norm": 1.0422887802124023, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0445, + "step": 12410 + }, + { + "epoch": 0.3551107934238742, + "grad_norm": 0.5336428880691528, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0408, + "step": 12420 + }, + { + "epoch": 0.35539671193709793, + "grad_norm": 0.6634368896484375, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0374, + "step": 12430 + }, + { + "epoch": 0.35568263045032167, + "grad_norm": 0.5840758085250854, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0417, + "step": 12440 + }, + { + "epoch": 0.3559685489635454, + "grad_norm": 0.8465530872344971, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0449, + "step": 12450 + }, + { + "epoch": 0.35625446747676914, + "grad_norm": 0.48737838864326477, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0439, + "step": 12460 + }, + { + "epoch": 0.3565403859899928, + "grad_norm": 1.2267687320709229, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0395, + "step": 12470 + }, + { + "epoch": 0.35682630450321656, + "grad_norm": 0.4097842276096344, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0379, + "step": 12480 + }, + { + "epoch": 0.3571122230164403, + "grad_norm": 0.8895343542098999, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0415, + "step": 12490 + }, + { + "epoch": 0.35739814152966404, + "grad_norm": 0.6732933521270752, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0432, + "step": 12500 + }, + { + "epoch": 0.3576840600428878, + "grad_norm": 0.4521937966346741, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0442, + "step": 12510 + }, + { + "epoch": 0.3579699785561115, + "grad_norm": 0.5932701826095581, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0407, + "step": 12520 + }, + { + "epoch": 0.35825589706933525, + "grad_norm": 0.5595138669013977, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0387, + "step": 12530 + }, + { + "epoch": 0.358541815582559, + "grad_norm": 0.7205538153648376, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0393, + "step": 12540 + }, + { + "epoch": 0.3588277340957827, + "grad_norm": 0.4069580137729645, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0554, + "step": 12550 + }, + { + "epoch": 0.3591136526090064, + "grad_norm": 0.4881740212440491, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0411, + "step": 12560 + }, + { + "epoch": 0.35939957112223014, + "grad_norm": 0.7710328102111816, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.043, + "step": 12570 + }, + { + "epoch": 0.3596854896354539, + "grad_norm": 0.6593908071517944, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.046, + "step": 12580 + }, + { + "epoch": 0.3599714081486776, + "grad_norm": 0.6712149977684021, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0392, + "step": 12590 + }, + { + "epoch": 0.36025732666190136, + "grad_norm": 0.6103658080101013, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0482, + "step": 12600 + }, + { + "epoch": 0.3605432451751251, + "grad_norm": 0.5170528292655945, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0441, + "step": 12610 + }, + { + "epoch": 0.36082916368834883, + "grad_norm": 0.47434374690055847, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0436, + "step": 12620 + }, + { + "epoch": 0.36111508220157257, + "grad_norm": 0.6546452045440674, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0441, + "step": 12630 + }, + { + "epoch": 0.3614010007147963, + "grad_norm": 1.3334686756134033, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0464, + "step": 12640 + }, + { + "epoch": 0.36168691922802, + "grad_norm": 1.3882309198379517, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0527, + "step": 12650 + }, + { + "epoch": 0.3619728377412437, + "grad_norm": 0.829872190952301, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0499, + "step": 12660 + }, + { + "epoch": 0.36225875625446746, + "grad_norm": 0.6917227506637573, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0513, + "step": 12670 + }, + { + "epoch": 0.3625446747676912, + "grad_norm": 0.3825722634792328, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0455, + "step": 12680 + }, + { + "epoch": 0.36283059328091494, + "grad_norm": 0.7726976275444031, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0366, + "step": 12690 + }, + { + "epoch": 0.3631165117941387, + "grad_norm": 0.48851099610328674, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0363, + "step": 12700 + }, + { + "epoch": 0.3634024303073624, + "grad_norm": 0.5034362077713013, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0461, + "step": 12710 + }, + { + "epoch": 0.36368834882058615, + "grad_norm": 0.8411096334457397, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0448, + "step": 12720 + }, + { + "epoch": 0.3639742673338099, + "grad_norm": 0.7185337543487549, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0366, + "step": 12730 + }, + { + "epoch": 0.36426018584703357, + "grad_norm": 0.5850857496261597, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0414, + "step": 12740 + }, + { + "epoch": 0.3645461043602573, + "grad_norm": 0.47304606437683105, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0464, + "step": 12750 + }, + { + "epoch": 0.36483202287348104, + "grad_norm": 0.7190109491348267, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0418, + "step": 12760 + }, + { + "epoch": 0.3651179413867048, + "grad_norm": 0.8053406476974487, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0407, + "step": 12770 + }, + { + "epoch": 0.3654038598999285, + "grad_norm": 0.8875076174736023, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0471, + "step": 12780 + }, + { + "epoch": 0.36568977841315226, + "grad_norm": 0.5206999182701111, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0478, + "step": 12790 + }, + { + "epoch": 0.365975696926376, + "grad_norm": 0.5034269690513611, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0422, + "step": 12800 + }, + { + "epoch": 0.36626161543959973, + "grad_norm": 0.9846853017807007, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.045, + "step": 12810 + }, + { + "epoch": 0.36654753395282347, + "grad_norm": 0.49341151118278503, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0471, + "step": 12820 + }, + { + "epoch": 0.36683345246604715, + "grad_norm": 0.765583336353302, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0411, + "step": 12830 + }, + { + "epoch": 0.3671193709792709, + "grad_norm": 0.5193378925323486, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0522, + "step": 12840 + }, + { + "epoch": 0.3674052894924946, + "grad_norm": 0.8142374157905579, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0374, + "step": 12850 + }, + { + "epoch": 0.36769120800571836, + "grad_norm": 0.7233540415763855, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0516, + "step": 12860 + }, + { + "epoch": 0.3679771265189421, + "grad_norm": 0.38758793473243713, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0437, + "step": 12870 + }, + { + "epoch": 0.36826304503216584, + "grad_norm": 0.36923956871032715, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.041, + "step": 12880 + }, + { + "epoch": 0.3685489635453896, + "grad_norm": 1.0518147945404053, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0446, + "step": 12890 + }, + { + "epoch": 0.3688348820586133, + "grad_norm": 0.5833591818809509, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0362, + "step": 12900 + }, + { + "epoch": 0.36912080057183705, + "grad_norm": 0.6178849339485168, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.041, + "step": 12910 + }, + { + "epoch": 0.36940671908506073, + "grad_norm": 0.7599044442176819, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0473, + "step": 12920 + }, + { + "epoch": 0.36969263759828447, + "grad_norm": 0.7787651419639587, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0461, + "step": 12930 + }, + { + "epoch": 0.3699785561115082, + "grad_norm": 0.3847586512565613, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0413, + "step": 12940 + }, + { + "epoch": 0.37026447462473194, + "grad_norm": 0.6218805313110352, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0424, + "step": 12950 + }, + { + "epoch": 0.3705503931379557, + "grad_norm": 0.6770363450050354, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0426, + "step": 12960 + }, + { + "epoch": 0.3708363116511794, + "grad_norm": 0.6817107796669006, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.041, + "step": 12970 + }, + { + "epoch": 0.37112223016440316, + "grad_norm": 1.6997944116592407, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0626, + "step": 12980 + }, + { + "epoch": 0.3714081486776269, + "grad_norm": 0.4540708363056183, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0356, + "step": 12990 + }, + { + "epoch": 0.37169406719085063, + "grad_norm": 0.4272336959838867, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0354, + "step": 13000 + }, + { + "epoch": 0.3719799857040743, + "grad_norm": 0.4723891019821167, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0438, + "step": 13010 + }, + { + "epoch": 0.37226590421729805, + "grad_norm": 0.5508099794387817, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.042, + "step": 13020 + }, + { + "epoch": 0.3725518227305218, + "grad_norm": 1.05836021900177, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0472, + "step": 13030 + }, + { + "epoch": 0.3728377412437455, + "grad_norm": 0.4397801458835602, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0462, + "step": 13040 + }, + { + "epoch": 0.37312365975696926, + "grad_norm": 0.3131158649921417, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0383, + "step": 13050 + }, + { + "epoch": 0.373409578270193, + "grad_norm": 0.5489990711212158, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0398, + "step": 13060 + }, + { + "epoch": 0.37369549678341674, + "grad_norm": 0.7425751686096191, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0416, + "step": 13070 + }, + { + "epoch": 0.3739814152966405, + "grad_norm": 0.6337125301361084, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0387, + "step": 13080 + }, + { + "epoch": 0.3742673338098642, + "grad_norm": 0.656467854976654, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0431, + "step": 13090 + }, + { + "epoch": 0.3745532523230879, + "grad_norm": 0.7011964321136475, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0487, + "step": 13100 + }, + { + "epoch": 0.37483917083631163, + "grad_norm": 0.4949609041213989, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0429, + "step": 13110 + }, + { + "epoch": 0.37512508934953537, + "grad_norm": 0.6796516180038452, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0405, + "step": 13120 + }, + { + "epoch": 0.3754110078627591, + "grad_norm": 0.41161492466926575, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0359, + "step": 13130 + }, + { + "epoch": 0.37569692637598284, + "grad_norm": 0.4463254511356354, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0353, + "step": 13140 + }, + { + "epoch": 0.3759828448892066, + "grad_norm": 0.4082377254962921, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.047, + "step": 13150 + }, + { + "epoch": 0.3762687634024303, + "grad_norm": 0.7927104830741882, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0484, + "step": 13160 + }, + { + "epoch": 0.37655468191565405, + "grad_norm": 0.5212385058403015, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.041, + "step": 13170 + }, + { + "epoch": 0.3768406004288778, + "grad_norm": 0.7408128380775452, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0462, + "step": 13180 + }, + { + "epoch": 0.3771265189421015, + "grad_norm": 0.3847906291484833, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0361, + "step": 13190 + }, + { + "epoch": 0.3774124374553252, + "grad_norm": 0.5039756298065186, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0385, + "step": 13200 + }, + { + "epoch": 0.37769835596854895, + "grad_norm": 0.5682945251464844, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0369, + "step": 13210 + }, + { + "epoch": 0.3779842744817727, + "grad_norm": 0.5985261797904968, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0376, + "step": 13220 + }, + { + "epoch": 0.3782701929949964, + "grad_norm": 0.7080312967300415, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0514, + "step": 13230 + }, + { + "epoch": 0.37855611150822016, + "grad_norm": 0.7488406300544739, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0421, + "step": 13240 + }, + { + "epoch": 0.3788420300214439, + "grad_norm": 0.38066044449806213, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0411, + "step": 13250 + }, + { + "epoch": 0.37912794853466764, + "grad_norm": 0.6335283517837524, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0526, + "step": 13260 + }, + { + "epoch": 0.3794138670478914, + "grad_norm": 0.7008160352706909, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0402, + "step": 13270 + }, + { + "epoch": 0.37969978556111506, + "grad_norm": 0.4219777286052704, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.039, + "step": 13280 + }, + { + "epoch": 0.3799857040743388, + "grad_norm": 0.6447705030441284, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0412, + "step": 13290 + }, + { + "epoch": 0.38027162258756253, + "grad_norm": 0.4625374674797058, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0437, + "step": 13300 + }, + { + "epoch": 0.38055754110078627, + "grad_norm": 0.4056257903575897, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0377, + "step": 13310 + }, + { + "epoch": 0.38084345961401, + "grad_norm": 0.425281286239624, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0378, + "step": 13320 + }, + { + "epoch": 0.38112937812723374, + "grad_norm": 0.4031837582588196, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0361, + "step": 13330 + }, + { + "epoch": 0.3814152966404575, + "grad_norm": 0.469175785779953, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0391, + "step": 13340 + }, + { + "epoch": 0.3817012151536812, + "grad_norm": 0.36555227637290955, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0352, + "step": 13350 + }, + { + "epoch": 0.38198713366690495, + "grad_norm": 0.8802763819694519, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0412, + "step": 13360 + }, + { + "epoch": 0.38227305218012864, + "grad_norm": 0.5733079314231873, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0418, + "step": 13370 + }, + { + "epoch": 0.3825589706933524, + "grad_norm": 0.606238842010498, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0518, + "step": 13380 + }, + { + "epoch": 0.3828448892065761, + "grad_norm": 0.5096673369407654, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0404, + "step": 13390 + }, + { + "epoch": 0.38313080771979985, + "grad_norm": 0.8240867853164673, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0513, + "step": 13400 + }, + { + "epoch": 0.3834167262330236, + "grad_norm": 0.3757685422897339, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0407, + "step": 13410 + }, + { + "epoch": 0.3837026447462473, + "grad_norm": 0.4560941755771637, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0429, + "step": 13420 + }, + { + "epoch": 0.38398856325947106, + "grad_norm": 0.42831951379776, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0387, + "step": 13430 + }, + { + "epoch": 0.3842744817726948, + "grad_norm": 0.8373785614967346, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0473, + "step": 13440 + }, + { + "epoch": 0.38456040028591854, + "grad_norm": 0.9560670256614685, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0442, + "step": 13450 + }, + { + "epoch": 0.3848463187991422, + "grad_norm": 0.4101570248603821, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0429, + "step": 13460 + }, + { + "epoch": 0.38513223731236595, + "grad_norm": 0.673739492893219, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0525, + "step": 13470 + }, + { + "epoch": 0.3854181558255897, + "grad_norm": 1.126909852027893, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0499, + "step": 13480 + }, + { + "epoch": 0.38570407433881343, + "grad_norm": 0.571437656879425, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0431, + "step": 13490 + }, + { + "epoch": 0.38598999285203717, + "grad_norm": 0.5121229887008667, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0419, + "step": 13500 + }, + { + "epoch": 0.3862759113652609, + "grad_norm": 0.6143786907196045, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0373, + "step": 13510 + }, + { + "epoch": 0.38656182987848464, + "grad_norm": 0.395014226436615, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0457, + "step": 13520 + }, + { + "epoch": 0.3868477483917084, + "grad_norm": 0.46027693152427673, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0372, + "step": 13530 + }, + { + "epoch": 0.3871336669049321, + "grad_norm": 0.42744559049606323, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0417, + "step": 13540 + }, + { + "epoch": 0.3874195854181558, + "grad_norm": 0.4765837490558624, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0442, + "step": 13550 + }, + { + "epoch": 0.38770550393137954, + "grad_norm": 0.9767054319381714, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0397, + "step": 13560 + }, + { + "epoch": 0.3879914224446033, + "grad_norm": 0.5535935759544373, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0423, + "step": 13570 + }, + { + "epoch": 0.388277340957827, + "grad_norm": 0.3802829384803772, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0329, + "step": 13580 + }, + { + "epoch": 0.38856325947105075, + "grad_norm": 0.6564178466796875, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0423, + "step": 13590 + }, + { + "epoch": 0.3888491779842745, + "grad_norm": 0.4400223195552826, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0356, + "step": 13600 + }, + { + "epoch": 0.3891350964974982, + "grad_norm": 0.4441612958908081, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0576, + "step": 13610 + }, + { + "epoch": 0.38942101501072196, + "grad_norm": 0.5270922780036926, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0406, + "step": 13620 + }, + { + "epoch": 0.3897069335239457, + "grad_norm": 0.6497722268104553, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0389, + "step": 13630 + }, + { + "epoch": 0.3899928520371694, + "grad_norm": 0.628182053565979, + "learning_rate": 1.280216624157504e-05, + "loss": 0.049, + "step": 13640 + }, + { + "epoch": 0.3902787705503931, + "grad_norm": 0.5242640376091003, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0389, + "step": 13650 + }, + { + "epoch": 0.39056468906361685, + "grad_norm": 0.5140895843505859, + "learning_rate": 1.278305741539386e-05, + "loss": 0.047, + "step": 13660 + }, + { + "epoch": 0.3908506075768406, + "grad_norm": 0.531012773513794, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0415, + "step": 13670 + }, + { + "epoch": 0.39113652609006433, + "grad_norm": 0.5066007375717163, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0411, + "step": 13680 + }, + { + "epoch": 0.39142244460328807, + "grad_norm": 1.0783177614212036, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0371, + "step": 13690 + }, + { + "epoch": 0.3917083631165118, + "grad_norm": 0.592755913734436, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0402, + "step": 13700 + }, + { + "epoch": 0.39199428162973554, + "grad_norm": 0.5595790147781372, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0543, + "step": 13710 + }, + { + "epoch": 0.3922802001429593, + "grad_norm": 0.5388237237930298, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0487, + "step": 13720 + }, + { + "epoch": 0.39256611865618296, + "grad_norm": 0.5311065316200256, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0389, + "step": 13730 + }, + { + "epoch": 0.3928520371694067, + "grad_norm": 0.8037494421005249, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0456, + "step": 13740 + }, + { + "epoch": 0.39313795568263044, + "grad_norm": 0.851921796798706, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0389, + "step": 13750 + }, + { + "epoch": 0.3934238741958542, + "grad_norm": 0.5924596190452576, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0401, + "step": 13760 + }, + { + "epoch": 0.3937097927090779, + "grad_norm": 0.5660725831985474, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0443, + "step": 13770 + }, + { + "epoch": 0.39399571122230165, + "grad_norm": 0.4110502004623413, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0438, + "step": 13780 + }, + { + "epoch": 0.3942816297355254, + "grad_norm": 0.7104408144950867, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.042, + "step": 13790 + }, + { + "epoch": 0.3945675482487491, + "grad_norm": 0.5490137338638306, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0477, + "step": 13800 + }, + { + "epoch": 0.39485346676197286, + "grad_norm": 0.4189203083515167, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0446, + "step": 13810 + }, + { + "epoch": 0.39513938527519654, + "grad_norm": 3.620929479598999, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0541, + "step": 13820 + }, + { + "epoch": 0.3954253037884203, + "grad_norm": 0.4670915901660919, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0391, + "step": 13830 + }, + { + "epoch": 0.395711222301644, + "grad_norm": 0.4475649297237396, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.04, + "step": 13840 + }, + { + "epoch": 0.39599714081486775, + "grad_norm": 0.4646693170070648, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0412, + "step": 13850 + }, + { + "epoch": 0.3962830593280915, + "grad_norm": 0.4141371250152588, + "learning_rate": 1.259152361972498e-05, + "loss": 0.039, + "step": 13860 + }, + { + "epoch": 0.39656897784131523, + "grad_norm": 0.7549411058425903, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0414, + "step": 13870 + }, + { + "epoch": 0.39685489635453897, + "grad_norm": 0.5687856078147888, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0441, + "step": 13880 + }, + { + "epoch": 0.3971408148677627, + "grad_norm": 0.582946240901947, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0451, + "step": 13890 + }, + { + "epoch": 0.39742673338098644, + "grad_norm": 0.6410595178604126, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0362, + "step": 13900 + }, + { + "epoch": 0.3977126518942101, + "grad_norm": 0.4375670850276947, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0552, + "step": 13910 + }, + { + "epoch": 0.39799857040743386, + "grad_norm": 0.5675646662712097, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0373, + "step": 13920 + }, + { + "epoch": 0.3982844889206576, + "grad_norm": 0.544170618057251, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0449, + "step": 13930 + }, + { + "epoch": 0.39857040743388134, + "grad_norm": 0.44928276538848877, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0461, + "step": 13940 + }, + { + "epoch": 0.3988563259471051, + "grad_norm": 0.511382520198822, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0413, + "step": 13950 + }, + { + "epoch": 0.3991422444603288, + "grad_norm": 0.38443753123283386, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0374, + "step": 13960 + }, + { + "epoch": 0.39942816297355255, + "grad_norm": 0.5726080536842346, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0553, + "step": 13970 + }, + { + "epoch": 0.3997140814867763, + "grad_norm": 0.554694414138794, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0404, + "step": 13980 + }, + { + "epoch": 0.4, + "grad_norm": 0.4891316592693329, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0418, + "step": 13990 + }, + { + "epoch": 0.4002859185132237, + "grad_norm": 0.5150312781333923, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0418, + "step": 14000 + }, + { + "epoch": 0.40057183702644744, + "grad_norm": 0.9077253937721252, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0415, + "step": 14010 + }, + { + "epoch": 0.4008577555396712, + "grad_norm": 0.9126781225204468, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.048, + "step": 14020 + }, + { + "epoch": 0.4011436740528949, + "grad_norm": 0.6264623999595642, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0411, + "step": 14030 + }, + { + "epoch": 0.40142959256611865, + "grad_norm": 0.523853600025177, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.051, + "step": 14040 + }, + { + "epoch": 0.4017155110793424, + "grad_norm": 0.6340035200119019, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0426, + "step": 14050 + }, + { + "epoch": 0.40200142959256613, + "grad_norm": 0.3594725430011749, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0397, + "step": 14060 + }, + { + "epoch": 0.40228734810578987, + "grad_norm": 0.941470742225647, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0402, + "step": 14070 + }, + { + "epoch": 0.4025732666190136, + "grad_norm": 0.840506911277771, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0473, + "step": 14080 + }, + { + "epoch": 0.4028591851322373, + "grad_norm": 0.3359200954437256, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0405, + "step": 14090 + }, + { + "epoch": 0.403145103645461, + "grad_norm": 0.49658629298210144, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0464, + "step": 14100 + }, + { + "epoch": 0.40343102215868476, + "grad_norm": 0.7940187454223633, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0417, + "step": 14110 + }, + { + "epoch": 0.4037169406719085, + "grad_norm": 0.30110660195350647, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0371, + "step": 14120 + }, + { + "epoch": 0.40400285918513223, + "grad_norm": 0.42845240235328674, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.053, + "step": 14130 + }, + { + "epoch": 0.40428877769835597, + "grad_norm": 0.997348427772522, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.041, + "step": 14140 + }, + { + "epoch": 0.4045746962115797, + "grad_norm": 0.4759966731071472, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0377, + "step": 14150 + }, + { + "epoch": 0.40486061472480345, + "grad_norm": 0.42045602202415466, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0397, + "step": 14160 + }, + { + "epoch": 0.4051465332380272, + "grad_norm": 0.6400002837181091, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0507, + "step": 14170 + }, + { + "epoch": 0.40543245175125087, + "grad_norm": 0.5473673939704895, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0359, + "step": 14180 + }, + { + "epoch": 0.4057183702644746, + "grad_norm": 0.7414730787277222, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0416, + "step": 14190 + }, + { + "epoch": 0.40600428877769834, + "grad_norm": 0.4691861867904663, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0363, + "step": 14200 + }, + { + "epoch": 0.4062902072909221, + "grad_norm": 0.9186112880706787, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0445, + "step": 14210 + }, + { + "epoch": 0.4065761258041458, + "grad_norm": 0.6782190203666687, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.40686204431736955, + "grad_norm": 0.6948013305664062, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.037, + "step": 14230 + }, + { + "epoch": 0.4071479628305933, + "grad_norm": 0.3034680485725403, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0371, + "step": 14240 + }, + { + "epoch": 0.40743388134381703, + "grad_norm": 0.4254174828529358, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0449, + "step": 14250 + }, + { + "epoch": 0.40771979985704077, + "grad_norm": 1.3622064590454102, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0428, + "step": 14260 + }, + { + "epoch": 0.40800571837026445, + "grad_norm": 0.5928359031677246, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0424, + "step": 14270 + }, + { + "epoch": 0.4082916368834882, + "grad_norm": 0.9103132486343384, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0414, + "step": 14280 + }, + { + "epoch": 0.4085775553967119, + "grad_norm": 0.6338028311729431, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0376, + "step": 14290 + }, + { + "epoch": 0.40886347390993566, + "grad_norm": 0.9920284748077393, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0393, + "step": 14300 + }, + { + "epoch": 0.4091493924231594, + "grad_norm": 0.411830335855484, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0336, + "step": 14310 + }, + { + "epoch": 0.40943531093638313, + "grad_norm": 0.6977682709693909, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0454, + "step": 14320 + }, + { + "epoch": 0.40972122944960687, + "grad_norm": 0.6303663849830627, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0453, + "step": 14330 + }, + { + "epoch": 0.4100071479628306, + "grad_norm": 0.3048207759857178, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0373, + "step": 14340 + }, + { + "epoch": 0.41029306647605435, + "grad_norm": 0.7683395743370056, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0438, + "step": 14350 + }, + { + "epoch": 0.41057898498927803, + "grad_norm": 0.5791511535644531, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0392, + "step": 14360 + }, + { + "epoch": 0.41086490350250177, + "grad_norm": 0.876626193523407, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0324, + "step": 14370 + }, + { + "epoch": 0.4111508220157255, + "grad_norm": 0.5971815586090088, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0368, + "step": 14380 + }, + { + "epoch": 0.41143674052894924, + "grad_norm": 0.6508862376213074, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0411, + "step": 14390 + }, + { + "epoch": 0.411722659042173, + "grad_norm": 0.4704359471797943, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0351, + "step": 14400 + }, + { + "epoch": 0.4120085775553967, + "grad_norm": 0.4266453683376312, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0367, + "step": 14410 + }, + { + "epoch": 0.41229449606862045, + "grad_norm": 0.5898434519767761, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0376, + "step": 14420 + }, + { + "epoch": 0.4125804145818442, + "grad_norm": 0.8741532564163208, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0419, + "step": 14430 + }, + { + "epoch": 0.41286633309506793, + "grad_norm": 0.24328190088272095, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0333, + "step": 14440 + }, + { + "epoch": 0.4131522516082916, + "grad_norm": 0.4263601303100586, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.039, + "step": 14450 + }, + { + "epoch": 0.41343817012151535, + "grad_norm": 0.6311615109443665, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0454, + "step": 14460 + }, + { + "epoch": 0.4137240886347391, + "grad_norm": 0.7424519658088684, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0392, + "step": 14470 + }, + { + "epoch": 0.4140100071479628, + "grad_norm": 0.48323145508766174, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0374, + "step": 14480 + }, + { + "epoch": 0.41429592566118656, + "grad_norm": 0.38597407937049866, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0393, + "step": 14490 + }, + { + "epoch": 0.4145818441744103, + "grad_norm": 0.7251518964767456, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0431, + "step": 14500 + }, + { + "epoch": 0.41486776268763403, + "grad_norm": 0.44361060857772827, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0426, + "step": 14510 + }, + { + "epoch": 0.41515368120085777, + "grad_norm": 0.5625014305114746, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0372, + "step": 14520 + }, + { + "epoch": 0.4154395997140815, + "grad_norm": 0.27855798602104187, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0356, + "step": 14530 + }, + { + "epoch": 0.4157255182273052, + "grad_norm": 0.5966296195983887, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0387, + "step": 14540 + }, + { + "epoch": 0.41601143674052893, + "grad_norm": 0.49445512890815735, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0355, + "step": 14550 + }, + { + "epoch": 0.41629735525375267, + "grad_norm": 0.3813278377056122, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0456, + "step": 14560 + }, + { + "epoch": 0.4165832737669764, + "grad_norm": 0.5962988138198853, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0401, + "step": 14570 + }, + { + "epoch": 0.41686919228020014, + "grad_norm": 0.4028547406196594, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0371, + "step": 14580 + }, + { + "epoch": 0.4171551107934239, + "grad_norm": 1.348706841468811, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0426, + "step": 14590 + }, + { + "epoch": 0.4174410293066476, + "grad_norm": 1.2782070636749268, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0393, + "step": 14600 + }, + { + "epoch": 0.41772694781987135, + "grad_norm": 1.0024999380111694, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0436, + "step": 14610 + }, + { + "epoch": 0.4180128663330951, + "grad_norm": 0.35450127720832825, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0411, + "step": 14620 + }, + { + "epoch": 0.41829878484631877, + "grad_norm": 0.5827250480651855, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0372, + "step": 14630 + }, + { + "epoch": 0.4185847033595425, + "grad_norm": 0.5905774235725403, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0394, + "step": 14640 + }, + { + "epoch": 0.41887062187276625, + "grad_norm": 0.652074933052063, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0405, + "step": 14650 + }, + { + "epoch": 0.41915654038599, + "grad_norm": 0.7245490550994873, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0473, + "step": 14660 + }, + { + "epoch": 0.4194424588992137, + "grad_norm": 0.5153012871742249, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.043, + "step": 14670 + }, + { + "epoch": 0.41972837741243746, + "grad_norm": 0.516107976436615, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0434, + "step": 14680 + }, + { + "epoch": 0.4200142959256612, + "grad_norm": 0.4743354618549347, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0429, + "step": 14690 + }, + { + "epoch": 0.42030021443888493, + "grad_norm": 0.547875165939331, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0395, + "step": 14700 + }, + { + "epoch": 0.42058613295210867, + "grad_norm": 0.6398400068283081, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0384, + "step": 14710 + }, + { + "epoch": 0.42087205146533235, + "grad_norm": 0.5891467332839966, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0399, + "step": 14720 + }, + { + "epoch": 0.4211579699785561, + "grad_norm": 0.3927595615386963, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0353, + "step": 14730 + }, + { + "epoch": 0.42144388849177983, + "grad_norm": 0.6477030515670776, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0492, + "step": 14740 + }, + { + "epoch": 0.42172980700500357, + "grad_norm": 0.7090615034103394, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.042, + "step": 14750 + }, + { + "epoch": 0.4220157255182273, + "grad_norm": 0.6572134494781494, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0406, + "step": 14760 + }, + { + "epoch": 0.42230164403145104, + "grad_norm": 0.787663996219635, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0424, + "step": 14770 + }, + { + "epoch": 0.4225875625446748, + "grad_norm": 0.8419309258460999, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0427, + "step": 14780 + }, + { + "epoch": 0.4228734810578985, + "grad_norm": 0.6204128861427307, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0364, + "step": 14790 + }, + { + "epoch": 0.42315939957112225, + "grad_norm": 0.7446070313453674, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0391, + "step": 14800 + }, + { + "epoch": 0.42344531808434593, + "grad_norm": 0.7446451783180237, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0384, + "step": 14810 + }, + { + "epoch": 0.42373123659756967, + "grad_norm": 0.6946475505828857, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0375, + "step": 14820 + }, + { + "epoch": 0.4240171551107934, + "grad_norm": 0.6997008323669434, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0393, + "step": 14830 + }, + { + "epoch": 0.42430307362401715, + "grad_norm": 0.4857316315174103, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0474, + "step": 14840 + }, + { + "epoch": 0.4245889921372409, + "grad_norm": 1.3516888618469238, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.047, + "step": 14850 + }, + { + "epoch": 0.4248749106504646, + "grad_norm": 0.40320220589637756, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0418, + "step": 14860 + }, + { + "epoch": 0.42516082916368836, + "grad_norm": 0.9002796411514282, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0434, + "step": 14870 + }, + { + "epoch": 0.4254467476769121, + "grad_norm": 0.3810071349143982, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0338, + "step": 14880 + }, + { + "epoch": 0.42573266619013583, + "grad_norm": 0.5786157250404358, + "learning_rate": 1.159527607963768e-05, + "loss": 0.037, + "step": 14890 + }, + { + "epoch": 0.4260185847033595, + "grad_norm": 0.6316869258880615, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0388, + "step": 14900 + }, + { + "epoch": 0.42630450321658325, + "grad_norm": 0.608745276927948, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0426, + "step": 14910 + }, + { + "epoch": 0.426590421729807, + "grad_norm": 0.6655036807060242, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0433, + "step": 14920 + }, + { + "epoch": 0.4268763402430307, + "grad_norm": 0.29059523344039917, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0507, + "step": 14930 + }, + { + "epoch": 0.42716225875625446, + "grad_norm": 0.9066076278686523, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0447, + "step": 14940 + }, + { + "epoch": 0.4274481772694782, + "grad_norm": 1.0660220384597778, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0512, + "step": 14950 + }, + { + "epoch": 0.42773409578270194, + "grad_norm": 0.6081144213676453, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0426, + "step": 14960 + }, + { + "epoch": 0.4280200142959257, + "grad_norm": 0.46524369716644287, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0435, + "step": 14970 + }, + { + "epoch": 0.4283059328091494, + "grad_norm": 0.3497388958930969, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0492, + "step": 14980 + }, + { + "epoch": 0.4285918513223731, + "grad_norm": 0.41300803422927856, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.034, + "step": 14990 + }, + { + "epoch": 0.42887776983559683, + "grad_norm": 0.4363289177417755, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0358, + "step": 15000 + }, + { + "epoch": 0.42916368834882057, + "grad_norm": 1.314915418624878, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.047, + "step": 15010 + }, + { + "epoch": 0.4294496068620443, + "grad_norm": 0.558199942111969, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0313, + "step": 15020 + }, + { + "epoch": 0.42973552537526805, + "grad_norm": 0.3857463598251343, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0416, + "step": 15030 + }, + { + "epoch": 0.4300214438884918, + "grad_norm": 0.4701749384403229, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0425, + "step": 15040 + }, + { + "epoch": 0.4303073624017155, + "grad_norm": 0.4611213803291321, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0457, + "step": 15050 + }, + { + "epoch": 0.43059328091493926, + "grad_norm": 0.5338016152381897, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.038, + "step": 15060 + }, + { + "epoch": 0.430879199428163, + "grad_norm": 0.9078943133354187, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0395, + "step": 15070 + }, + { + "epoch": 0.4311651179413867, + "grad_norm": 0.5354048013687134, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0403, + "step": 15080 + }, + { + "epoch": 0.4314510364546104, + "grad_norm": 0.35511279106140137, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0377, + "step": 15090 + }, + { + "epoch": 0.43173695496783415, + "grad_norm": 0.37104350328445435, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0426, + "step": 15100 + }, + { + "epoch": 0.4320228734810579, + "grad_norm": 0.8916210532188416, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0387, + "step": 15110 + }, + { + "epoch": 0.4323087919942816, + "grad_norm": 0.514994740486145, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0384, + "step": 15120 + }, + { + "epoch": 0.43259471050750536, + "grad_norm": 0.8440690040588379, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0437, + "step": 15130 + }, + { + "epoch": 0.4328806290207291, + "grad_norm": 0.6815949082374573, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0453, + "step": 15140 + }, + { + "epoch": 0.43316654753395284, + "grad_norm": 0.33178189396858215, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0351, + "step": 15150 + }, + { + "epoch": 0.4334524660471766, + "grad_norm": 0.5686727166175842, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0368, + "step": 15160 + }, + { + "epoch": 0.43373838456040026, + "grad_norm": 0.44143930077552795, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0443, + "step": 15170 + }, + { + "epoch": 0.434024303073624, + "grad_norm": 0.3238232135772705, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0348, + "step": 15180 + }, + { + "epoch": 0.43431022158684773, + "grad_norm": 0.5038242340087891, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0343, + "step": 15190 + }, + { + "epoch": 0.43459614010007147, + "grad_norm": 0.4904351234436035, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0397, + "step": 15200 + }, + { + "epoch": 0.4348820586132952, + "grad_norm": 0.5325750708580017, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0499, + "step": 15210 + }, + { + "epoch": 0.43516797712651895, + "grad_norm": 0.39443954825401306, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.044, + "step": 15220 + }, + { + "epoch": 0.4354538956397427, + "grad_norm": 0.6782003045082092, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0358, + "step": 15230 + }, + { + "epoch": 0.4357398141529664, + "grad_norm": 0.47862571477890015, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0418, + "step": 15240 + }, + { + "epoch": 0.43602573266619016, + "grad_norm": 1.6515535116195679, + "learning_rate": 1.124468908014616e-05, + "loss": 0.043, + "step": 15250 + }, + { + "epoch": 0.43631165117941384, + "grad_norm": 0.4902660846710205, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0371, + "step": 15260 + }, + { + "epoch": 0.4365975696926376, + "grad_norm": 0.5742762088775635, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0369, + "step": 15270 + }, + { + "epoch": 0.4368834882058613, + "grad_norm": 0.42058590054512024, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0378, + "step": 15280 + }, + { + "epoch": 0.43716940671908505, + "grad_norm": 0.43729284405708313, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0352, + "step": 15290 + }, + { + "epoch": 0.4374553252323088, + "grad_norm": 0.4689466953277588, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0433, + "step": 15300 + }, + { + "epoch": 0.4377412437455325, + "grad_norm": 0.6272432208061218, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0548, + "step": 15310 + }, + { + "epoch": 0.43802716225875626, + "grad_norm": 1.1129611730575562, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0437, + "step": 15320 + }, + { + "epoch": 0.43831308077198, + "grad_norm": 0.9332655072212219, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0503, + "step": 15330 + }, + { + "epoch": 0.43859899928520374, + "grad_norm": 0.35150477290153503, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0351, + "step": 15340 + }, + { + "epoch": 0.4388849177984274, + "grad_norm": 0.3826565444469452, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0361, + "step": 15350 + }, + { + "epoch": 0.43917083631165116, + "grad_norm": 0.817319393157959, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0352, + "step": 15360 + }, + { + "epoch": 0.4394567548248749, + "grad_norm": 0.4379598796367645, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0469, + "step": 15370 + }, + { + "epoch": 0.43974267333809863, + "grad_norm": 0.6475314497947693, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0456, + "step": 15380 + }, + { + "epoch": 0.44002859185132237, + "grad_norm": 0.529088020324707, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0453, + "step": 15390 + }, + { + "epoch": 0.4403145103645461, + "grad_norm": 0.4915194809436798, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0369, + "step": 15400 + }, + { + "epoch": 0.44060042887776985, + "grad_norm": 0.4766380786895752, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0391, + "step": 15410 + }, + { + "epoch": 0.4408863473909936, + "grad_norm": 0.34667786955833435, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0327, + "step": 15420 + }, + { + "epoch": 0.4411722659042173, + "grad_norm": 0.504242479801178, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0413, + "step": 15430 + }, + { + "epoch": 0.441458184417441, + "grad_norm": 0.49786439538002014, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0361, + "step": 15440 + }, + { + "epoch": 0.44174410293066474, + "grad_norm": 0.4997329115867615, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0368, + "step": 15450 + }, + { + "epoch": 0.4420300214438885, + "grad_norm": 0.2992185056209564, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0359, + "step": 15460 + }, + { + "epoch": 0.4423159399571122, + "grad_norm": 0.6645393371582031, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0401, + "step": 15470 + }, + { + "epoch": 0.44260185847033595, + "grad_norm": 0.6327983140945435, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0386, + "step": 15480 + }, + { + "epoch": 0.4428877769835597, + "grad_norm": 0.45607903599739075, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0386, + "step": 15490 + }, + { + "epoch": 0.4431736954967834, + "grad_norm": 0.4401610493659973, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0417, + "step": 15500 + }, + { + "epoch": 0.44345961401000716, + "grad_norm": 0.5778466463088989, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0417, + "step": 15510 + }, + { + "epoch": 0.4437455325232309, + "grad_norm": 0.2164914309978485, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0355, + "step": 15520 + }, + { + "epoch": 0.4440314510364546, + "grad_norm": 0.3869318664073944, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0361, + "step": 15530 + }, + { + "epoch": 0.4443173695496783, + "grad_norm": 0.3843154311180115, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0459, + "step": 15540 + }, + { + "epoch": 0.44460328806290206, + "grad_norm": 0.8488825559616089, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0406, + "step": 15550 + }, + { + "epoch": 0.4448892065761258, + "grad_norm": 0.5055183172225952, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0359, + "step": 15560 + }, + { + "epoch": 0.44517512508934953, + "grad_norm": 0.40923011302948, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0435, + "step": 15570 + }, + { + "epoch": 0.44546104360257327, + "grad_norm": 0.48997730016708374, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0395, + "step": 15580 + }, + { + "epoch": 0.445746962115797, + "grad_norm": 0.5149131417274475, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.041, + "step": 15590 + }, + { + "epoch": 0.44603288062902074, + "grad_norm": 0.7277303338050842, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0452, + "step": 15600 + }, + { + "epoch": 0.4463187991422445, + "grad_norm": 0.48676377534866333, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0363, + "step": 15610 + }, + { + "epoch": 0.44660471765546816, + "grad_norm": 0.49031221866607666, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0356, + "step": 15620 + }, + { + "epoch": 0.4468906361686919, + "grad_norm": 0.38877514004707336, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.036, + "step": 15630 + }, + { + "epoch": 0.44717655468191564, + "grad_norm": 0.570068895816803, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0403, + "step": 15640 + }, + { + "epoch": 0.4474624731951394, + "grad_norm": 0.48499882221221924, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0395, + "step": 15650 + }, + { + "epoch": 0.4477483917083631, + "grad_norm": 0.7251732349395752, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0399, + "step": 15660 + }, + { + "epoch": 0.44803431022158685, + "grad_norm": 0.3927334249019623, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0359, + "step": 15670 + }, + { + "epoch": 0.4483202287348106, + "grad_norm": 0.5614549517631531, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.035, + "step": 15680 + }, + { + "epoch": 0.4486061472480343, + "grad_norm": 0.383831262588501, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0416, + "step": 15690 + }, + { + "epoch": 0.44889206576125806, + "grad_norm": 1.9365276098251343, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0498, + "step": 15700 + }, + { + "epoch": 0.44917798427448175, + "grad_norm": 0.6964924931526184, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.034, + "step": 15710 + }, + { + "epoch": 0.4494639027877055, + "grad_norm": 0.5148108601570129, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0401, + "step": 15720 + }, + { + "epoch": 0.4497498213009292, + "grad_norm": 0.4529317617416382, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0361, + "step": 15730 + }, + { + "epoch": 0.45003573981415296, + "grad_norm": 0.6648512482643127, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0365, + "step": 15740 + }, + { + "epoch": 0.4503216583273767, + "grad_norm": 0.8183113932609558, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0416, + "step": 15750 + }, + { + "epoch": 0.45060757684060043, + "grad_norm": 0.8802638649940491, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0406, + "step": 15760 + }, + { + "epoch": 0.45089349535382417, + "grad_norm": 0.6329004764556885, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0395, + "step": 15770 + }, + { + "epoch": 0.4511794138670479, + "grad_norm": 0.35283520817756653, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0364, + "step": 15780 + }, + { + "epoch": 0.45146533238027164, + "grad_norm": 0.5156061053276062, + "learning_rate": 1.071827766589186e-05, + "loss": 0.031, + "step": 15790 + }, + { + "epoch": 0.4517512508934953, + "grad_norm": 0.37875205278396606, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0375, + "step": 15800 + }, + { + "epoch": 0.45203716940671906, + "grad_norm": 0.5543273687362671, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0421, + "step": 15810 + }, + { + "epoch": 0.4523230879199428, + "grad_norm": 0.3808431923389435, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0323, + "step": 15820 + }, + { + "epoch": 0.45260900643316654, + "grad_norm": 0.8648643493652344, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0396, + "step": 15830 + }, + { + "epoch": 0.4528949249463903, + "grad_norm": 0.7893536686897278, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0417, + "step": 15840 + }, + { + "epoch": 0.453180843459614, + "grad_norm": 0.904137134552002, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0384, + "step": 15850 + }, + { + "epoch": 0.45346676197283775, + "grad_norm": 0.6095889806747437, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0457, + "step": 15860 + }, + { + "epoch": 0.4537526804860615, + "grad_norm": 0.5691415667533875, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0438, + "step": 15870 + }, + { + "epoch": 0.4540385989992852, + "grad_norm": 0.37868618965148926, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0414, + "step": 15880 + }, + { + "epoch": 0.4543245175125089, + "grad_norm": 0.7962950468063354, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0405, + "step": 15890 + }, + { + "epoch": 0.45461043602573264, + "grad_norm": 0.8862378597259521, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0475, + "step": 15900 + }, + { + "epoch": 0.4548963545389564, + "grad_norm": 0.8762509822845459, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0472, + "step": 15910 + }, + { + "epoch": 0.4551822730521801, + "grad_norm": 0.6006313562393188, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0417, + "step": 15920 + }, + { + "epoch": 0.45546819156540386, + "grad_norm": 0.3340131938457489, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0374, + "step": 15930 + }, + { + "epoch": 0.4557541100786276, + "grad_norm": 0.2639552056789398, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0387, + "step": 15940 + }, + { + "epoch": 0.45604002859185133, + "grad_norm": 0.42564907670021057, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0376, + "step": 15950 + }, + { + "epoch": 0.45632594710507507, + "grad_norm": 0.503834068775177, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0344, + "step": 15960 + }, + { + "epoch": 0.4566118656182988, + "grad_norm": 0.5962334871292114, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0379, + "step": 15970 + }, + { + "epoch": 0.4568977841315225, + "grad_norm": 0.3271556794643402, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0361, + "step": 15980 + }, + { + "epoch": 0.4571837026447462, + "grad_norm": 0.5501612424850464, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0356, + "step": 15990 + }, + { + "epoch": 0.45746962115796996, + "grad_norm": 1.0399914979934692, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.039, + "step": 16000 + }, + { + "epoch": 0.4577555396711937, + "grad_norm": 0.42251288890838623, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0413, + "step": 16010 + }, + { + "epoch": 0.45804145818441744, + "grad_norm": 0.5694882869720459, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0501, + "step": 16020 + }, + { + "epoch": 0.4583273766976412, + "grad_norm": 0.37367814779281616, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0388, + "step": 16030 + }, + { + "epoch": 0.4586132952108649, + "grad_norm": 0.7947224974632263, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0324, + "step": 16040 + }, + { + "epoch": 0.45889921372408865, + "grad_norm": 0.47871798276901245, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0345, + "step": 16050 + }, + { + "epoch": 0.4591851322373124, + "grad_norm": 1.4443609714508057, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0502, + "step": 16060 + }, + { + "epoch": 0.45947105075053607, + "grad_norm": 0.8326191902160645, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0325, + "step": 16070 + }, + { + "epoch": 0.4597569692637598, + "grad_norm": 0.2887400686740875, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.035, + "step": 16080 + }, + { + "epoch": 0.46004288777698354, + "grad_norm": 0.34353405237197876, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0324, + "step": 16090 + }, + { + "epoch": 0.4603288062902073, + "grad_norm": 0.7319850325584412, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0307, + "step": 16100 + }, + { + "epoch": 0.460614724803431, + "grad_norm": 0.6628556847572327, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0398, + "step": 16110 + }, + { + "epoch": 0.46090064331665476, + "grad_norm": 0.39974722266197205, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.038, + "step": 16120 + }, + { + "epoch": 0.4611865618298785, + "grad_norm": 0.7769339680671692, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0425, + "step": 16130 + }, + { + "epoch": 0.46147248034310223, + "grad_norm": 0.6823691129684448, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.039, + "step": 16140 + }, + { + "epoch": 0.46175839885632597, + "grad_norm": 0.6749460697174072, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0388, + "step": 16150 + }, + { + "epoch": 0.46204431736954965, + "grad_norm": 1.0745635032653809, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0406, + "step": 16160 + }, + { + "epoch": 0.4623302358827734, + "grad_norm": 0.8388734459877014, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0345, + "step": 16170 + }, + { + "epoch": 0.4626161543959971, + "grad_norm": 0.675828218460083, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0355, + "step": 16180 + }, + { + "epoch": 0.46290207290922086, + "grad_norm": 0.9872504472732544, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0374, + "step": 16190 + }, + { + "epoch": 0.4631879914224446, + "grad_norm": 0.4705125689506531, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0416, + "step": 16200 + }, + { + "epoch": 0.46347390993566834, + "grad_norm": 0.43577539920806885, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.041, + "step": 16210 + }, + { + "epoch": 0.4637598284488921, + "grad_norm": 0.6472166180610657, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0372, + "step": 16220 + }, + { + "epoch": 0.4640457469621158, + "grad_norm": 1.0108906030654907, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0464, + "step": 16230 + }, + { + "epoch": 0.46433166547533955, + "grad_norm": 0.6221884489059448, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0396, + "step": 16240 + }, + { + "epoch": 0.46461758398856323, + "grad_norm": 0.7375202178955078, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0365, + "step": 16250 + }, + { + "epoch": 0.46490350250178697, + "grad_norm": 0.5090222358703613, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0404, + "step": 16260 + }, + { + "epoch": 0.4651894210150107, + "grad_norm": 0.5641722679138184, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0424, + "step": 16270 + }, + { + "epoch": 0.46547533952823444, + "grad_norm": 0.3946240246295929, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0433, + "step": 16280 + }, + { + "epoch": 0.4657612580414582, + "grad_norm": 0.525059700012207, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0399, + "step": 16290 + }, + { + "epoch": 0.4660471765546819, + "grad_norm": 0.6106441617012024, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0417, + "step": 16300 + }, + { + "epoch": 0.46633309506790566, + "grad_norm": 0.7064299583435059, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0331, + "step": 16310 + }, + { + "epoch": 0.4666190135811294, + "grad_norm": 0.6251654624938965, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0377, + "step": 16320 + }, + { + "epoch": 0.46690493209435313, + "grad_norm": 0.6626482009887695, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0355, + "step": 16330 + }, + { + "epoch": 0.4671908506075768, + "grad_norm": 0.32827794551849365, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0438, + "step": 16340 + }, + { + "epoch": 0.46747676912080055, + "grad_norm": 1.147644281387329, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.041, + "step": 16350 + }, + { + "epoch": 0.4677626876340243, + "grad_norm": 0.5785626769065857, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0362, + "step": 16360 + }, + { + "epoch": 0.468048606147248, + "grad_norm": 0.7087936401367188, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0364, + "step": 16370 + }, + { + "epoch": 0.46833452466047176, + "grad_norm": 0.7729533314704895, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0357, + "step": 16380 + }, + { + "epoch": 0.4686204431736955, + "grad_norm": 0.9080077409744263, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0445, + "step": 16390 + }, + { + "epoch": 0.46890636168691924, + "grad_norm": 0.5273067355155945, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0395, + "step": 16400 + }, + { + "epoch": 0.469192280200143, + "grad_norm": 0.4801991581916809, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0469, + "step": 16410 + }, + { + "epoch": 0.4694781987133667, + "grad_norm": 0.38060688972473145, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0377, + "step": 16420 + }, + { + "epoch": 0.4697641172265904, + "grad_norm": 1.335648536682129, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0444, + "step": 16430 + }, + { + "epoch": 0.47005003573981413, + "grad_norm": 0.6224690079689026, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0365, + "step": 16440 + }, + { + "epoch": 0.47033595425303787, + "grad_norm": 0.39938899874687195, + "learning_rate": 1.007637577910799e-05, + "loss": 0.037, + "step": 16450 + }, + { + "epoch": 0.4706218727662616, + "grad_norm": 0.47899872064590454, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0371, + "step": 16460 + }, + { + "epoch": 0.47090779127948534, + "grad_norm": 0.8991144895553589, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0337, + "step": 16470 + }, + { + "epoch": 0.4711937097927091, + "grad_norm": 0.6228598356246948, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0388, + "step": 16480 + }, + { + "epoch": 0.4714796283059328, + "grad_norm": 0.41108259558677673, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0378, + "step": 16490 + }, + { + "epoch": 0.47176554681915656, + "grad_norm": 0.722955048084259, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0381, + "step": 16500 + }, + { + "epoch": 0.4720514653323803, + "grad_norm": 0.6090973019599915, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0348, + "step": 16510 + }, + { + "epoch": 0.472337383845604, + "grad_norm": 0.483549565076828, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0456, + "step": 16520 + }, + { + "epoch": 0.4726233023588277, + "grad_norm": 0.4134727418422699, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0444, + "step": 16530 + }, + { + "epoch": 0.47290922087205145, + "grad_norm": 0.4629753530025482, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0382, + "step": 16540 + }, + { + "epoch": 0.4731951393852752, + "grad_norm": 0.8709504008293152, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0384, + "step": 16550 + }, + { + "epoch": 0.4734810578984989, + "grad_norm": 0.683397114276886, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0398, + "step": 16560 + }, + { + "epoch": 0.47376697641172266, + "grad_norm": 0.5743465423583984, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0431, + "step": 16570 + }, + { + "epoch": 0.4740528949249464, + "grad_norm": 1.0080480575561523, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0378, + "step": 16580 + }, + { + "epoch": 0.47433881343817014, + "grad_norm": 0.4668700098991394, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0369, + "step": 16590 + }, + { + "epoch": 0.4746247319513939, + "grad_norm": 0.6005896925926208, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0508, + "step": 16600 + }, + { + "epoch": 0.47491065046461756, + "grad_norm": 0.5788530707359314, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0354, + "step": 16610 + }, + { + "epoch": 0.4751965689778413, + "grad_norm": 0.38784441351890564, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0357, + "step": 16620 + }, + { + "epoch": 0.47548248749106503, + "grad_norm": 0.4809567928314209, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0331, + "step": 16630 + }, + { + "epoch": 0.47576840600428877, + "grad_norm": 0.6647809147834778, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0473, + "step": 16640 + }, + { + "epoch": 0.4760543245175125, + "grad_norm": 0.3968522548675537, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0304, + "step": 16650 + }, + { + "epoch": 0.47634024303073624, + "grad_norm": 0.3258526027202606, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0387, + "step": 16660 + }, + { + "epoch": 0.47662616154396, + "grad_norm": 0.43442079424858093, + "learning_rate": 9.863295834019308e-06, + "loss": 0.04, + "step": 16670 + }, + { + "epoch": 0.4769120800571837, + "grad_norm": 0.36909565329551697, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0351, + "step": 16680 + }, + { + "epoch": 0.47719799857040746, + "grad_norm": 0.5566768050193787, + "learning_rate": 9.843955128197274e-06, + "loss": 0.031, + "step": 16690 + }, + { + "epoch": 0.47748391708363114, + "grad_norm": 0.5705142617225647, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0359, + "step": 16700 + }, + { + "epoch": 0.4777698355968549, + "grad_norm": 0.28931716084480286, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0407, + "step": 16710 + }, + { + "epoch": 0.4780557541100786, + "grad_norm": 0.5509498715400696, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0363, + "step": 16720 + }, + { + "epoch": 0.47834167262330235, + "grad_norm": 0.3564346432685852, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0364, + "step": 16730 + }, + { + "epoch": 0.4786275911365261, + "grad_norm": 0.32734423875808716, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0369, + "step": 16740 + }, + { + "epoch": 0.4789135096497498, + "grad_norm": 0.3048594892024994, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0367, + "step": 16750 + }, + { + "epoch": 0.47919942816297356, + "grad_norm": 0.9007049798965454, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0377, + "step": 16760 + }, + { + "epoch": 0.4794853466761973, + "grad_norm": 0.7010983824729919, + "learning_rate": 9.76664747972605e-06, + "loss": 0.039, + "step": 16770 + }, + { + "epoch": 0.47977126518942104, + "grad_norm": 0.644473135471344, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0466, + "step": 16780 + }, + { + "epoch": 0.4800571837026447, + "grad_norm": 0.6333492398262024, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0373, + "step": 16790 + }, + { + "epoch": 0.48034310221586846, + "grad_norm": 0.5148355960845947, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0392, + "step": 16800 + }, + { + "epoch": 0.4806290207290922, + "grad_norm": 0.7288355231285095, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0381, + "step": 16810 + }, + { + "epoch": 0.48091493924231593, + "grad_norm": 0.3674873113632202, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0418, + "step": 16820 + }, + { + "epoch": 0.48120085775553967, + "grad_norm": 0.5055420398712158, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0336, + "step": 16830 + }, + { + "epoch": 0.4814867762687634, + "grad_norm": 0.641754686832428, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0342, + "step": 16840 + }, + { + "epoch": 0.48177269478198714, + "grad_norm": 0.308200478553772, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0364, + "step": 16850 + }, + { + "epoch": 0.4820586132952109, + "grad_norm": 0.41361021995544434, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0342, + "step": 16860 + }, + { + "epoch": 0.4823445318084346, + "grad_norm": 0.45777833461761475, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0353, + "step": 16870 + }, + { + "epoch": 0.4826304503216583, + "grad_norm": 0.7587664723396301, + "learning_rate": 9.660501900166734e-06, + "loss": 0.043, + "step": 16880 + }, + { + "epoch": 0.48291636883488204, + "grad_norm": 0.8740283250808716, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0372, + "step": 16890 + }, + { + "epoch": 0.4832022873481058, + "grad_norm": 0.3009270429611206, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0373, + "step": 16900 + }, + { + "epoch": 0.4834882058613295, + "grad_norm": 0.4439285695552826, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0349, + "step": 16910 + }, + { + "epoch": 0.48377412437455325, + "grad_norm": 0.39849671721458435, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0394, + "step": 16920 + }, + { + "epoch": 0.484060042887777, + "grad_norm": 0.6423043608665466, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0413, + "step": 16930 + }, + { + "epoch": 0.4843459614010007, + "grad_norm": 0.3683928847312927, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0387, + "step": 16940 + }, + { + "epoch": 0.48463187991422446, + "grad_norm": 0.7087769508361816, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0397, + "step": 16950 + }, + { + "epoch": 0.4849177984274482, + "grad_norm": 0.5348120927810669, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0405, + "step": 16960 + }, + { + "epoch": 0.4852037169406719, + "grad_norm": 0.549891471862793, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0363, + "step": 16970 + }, + { + "epoch": 0.4854896354538956, + "grad_norm": 0.7177272439002991, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0343, + "step": 16980 + }, + { + "epoch": 0.48577555396711936, + "grad_norm": 0.595417320728302, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0439, + "step": 16990 + }, + { + "epoch": 0.4860614724803431, + "grad_norm": 0.4838889241218567, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0387, + "step": 17000 + }, + { + "epoch": 0.48634739099356683, + "grad_norm": 0.6186223030090332, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0362, + "step": 17010 + }, + { + "epoch": 0.48663330950679057, + "grad_norm": 0.43383121490478516, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0381, + "step": 17020 + }, + { + "epoch": 0.4869192280200143, + "grad_norm": 0.6735527515411377, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0388, + "step": 17030 + }, + { + "epoch": 0.48720514653323804, + "grad_norm": 0.3746320605278015, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0491, + "step": 17040 + }, + { + "epoch": 0.4874910650464618, + "grad_norm": 0.29500988125801086, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0395, + "step": 17050 + }, + { + "epoch": 0.48777698355968546, + "grad_norm": 0.8518465757369995, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0435, + "step": 17060 + }, + { + "epoch": 0.4880629020729092, + "grad_norm": 0.9653190970420837, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0393, + "step": 17070 + }, + { + "epoch": 0.48834882058613294, + "grad_norm": 0.785724937915802, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0372, + "step": 17080 + }, + { + "epoch": 0.4886347390993567, + "grad_norm": 0.9450638890266418, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0406, + "step": 17090 + }, + { + "epoch": 0.4889206576125804, + "grad_norm": 0.645124077796936, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0361, + "step": 17100 + }, + { + "epoch": 0.48920657612580415, + "grad_norm": 0.3352372944355011, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0417, + "step": 17110 + }, + { + "epoch": 0.4894924946390279, + "grad_norm": 0.3858814835548401, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0345, + "step": 17120 + }, + { + "epoch": 0.4897784131522516, + "grad_norm": 0.5403604507446289, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0326, + "step": 17130 + }, + { + "epoch": 0.49006433166547536, + "grad_norm": 0.6986777782440186, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0417, + "step": 17140 + }, + { + "epoch": 0.49035025017869904, + "grad_norm": 0.5456675887107849, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0473, + "step": 17150 + }, + { + "epoch": 0.4906361686919228, + "grad_norm": 0.3961554765701294, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0341, + "step": 17160 + }, + { + "epoch": 0.4909220872051465, + "grad_norm": 0.5188277363777161, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0369, + "step": 17170 + }, + { + "epoch": 0.49120800571837026, + "grad_norm": 0.6042230725288391, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0352, + "step": 17180 + }, + { + "epoch": 0.491493924231594, + "grad_norm": 0.5485941171646118, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0405, + "step": 17190 + }, + { + "epoch": 0.49177984274481773, + "grad_norm": 0.5856509804725647, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0402, + "step": 17200 + }, + { + "epoch": 0.49206576125804147, + "grad_norm": 0.8656556010246277, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0349, + "step": 17210 + }, + { + "epoch": 0.4923516797712652, + "grad_norm": 0.4041757583618164, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0364, + "step": 17220 + }, + { + "epoch": 0.49263759828448894, + "grad_norm": 0.6135975122451782, + "learning_rate": 9.324104146177972e-06, + "loss": 0.036, + "step": 17230 + }, + { + "epoch": 0.4929235167977126, + "grad_norm": 0.5101860165596008, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0359, + "step": 17240 + }, + { + "epoch": 0.49320943531093636, + "grad_norm": 0.9913426041603088, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0552, + "step": 17250 + }, + { + "epoch": 0.4934953538241601, + "grad_norm": 0.6148158311843872, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0388, + "step": 17260 + }, + { + "epoch": 0.49378127233738384, + "grad_norm": 0.6651721596717834, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0374, + "step": 17270 + }, + { + "epoch": 0.4940671908506076, + "grad_norm": 0.9545061588287354, + "learning_rate": 9.276232738281744e-06, + "loss": 0.035, + "step": 17280 + }, + { + "epoch": 0.4943531093638313, + "grad_norm": 0.8923225402832031, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0366, + "step": 17290 + }, + { + "epoch": 0.49463902787705505, + "grad_norm": 0.5337848663330078, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0354, + "step": 17300 + }, + { + "epoch": 0.4949249463902788, + "grad_norm": 0.35039281845092773, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0341, + "step": 17310 + }, + { + "epoch": 0.4952108649035025, + "grad_norm": 0.47406911849975586, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0393, + "step": 17320 + }, + { + "epoch": 0.4954967834167262, + "grad_norm": 0.6226631999015808, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0375, + "step": 17330 + }, + { + "epoch": 0.49578270192994994, + "grad_norm": 0.6652712821960449, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0363, + "step": 17340 + }, + { + "epoch": 0.4960686204431737, + "grad_norm": 1.0042835474014282, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0368, + "step": 17350 + }, + { + "epoch": 0.4963545389563974, + "grad_norm": 0.4334045648574829, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0375, + "step": 17360 + }, + { + "epoch": 0.49664045746962115, + "grad_norm": 0.3561633229255676, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0347, + "step": 17370 + }, + { + "epoch": 0.4969263759828449, + "grad_norm": 0.5763550996780396, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0344, + "step": 17380 + }, + { + "epoch": 0.49721229449606863, + "grad_norm": 0.6306643486022949, + "learning_rate": 9.171095634265995e-06, + "loss": 0.037, + "step": 17390 + }, + { + "epoch": 0.49749821300929237, + "grad_norm": 0.4286569058895111, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0308, + "step": 17400 + }, + { + "epoch": 0.4977841315225161, + "grad_norm": 0.577983558177948, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0393, + "step": 17410 + }, + { + "epoch": 0.4980700500357398, + "grad_norm": 0.5714932084083557, + "learning_rate": 9.142466323573853e-06, + "loss": 0.038, + "step": 17420 + }, + { + "epoch": 0.4983559685489635, + "grad_norm": 0.7529498338699341, + "learning_rate": 9.132927564918328e-06, + "loss": 0.033, + "step": 17430 + }, + { + "epoch": 0.49864188706218726, + "grad_norm": 0.5179672241210938, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0367, + "step": 17440 + }, + { + "epoch": 0.498927805575411, + "grad_norm": 0.38424569368362427, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0401, + "step": 17450 + }, + { + "epoch": 0.49921372408863474, + "grad_norm": 0.469460129737854, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0379, + "step": 17460 + }, + { + "epoch": 0.4994996426018585, + "grad_norm": 0.3285387456417084, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0399, + "step": 17470 + }, + { + "epoch": 0.4997855611150822, + "grad_norm": 0.49863550066947937, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0313, + "step": 17480 + }, + { + "epoch": 0.5000714796283059, + "grad_norm": 0.3926186263561249, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0454, + "step": 17490 + }, + { + "epoch": 0.5003573981415297, + "grad_norm": 0.4476146399974823, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0472, + "step": 17500 + }, + { + "epoch": 0.5006433166547534, + "grad_norm": 0.5645599961280823, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0358, + "step": 17510 + }, + { + "epoch": 0.5009292351679772, + "grad_norm": 0.4813307225704193, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0364, + "step": 17520 + }, + { + "epoch": 0.5012151536812008, + "grad_norm": 0.49410971999168396, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0303, + "step": 17530 + }, + { + "epoch": 0.5015010721944246, + "grad_norm": 0.7172105312347412, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0404, + "step": 17540 + }, + { + "epoch": 0.5017869907076483, + "grad_norm": 0.43401873111724854, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0402, + "step": 17550 + }, + { + "epoch": 0.502072909220872, + "grad_norm": 0.6497406363487244, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0364, + "step": 17560 + }, + { + "epoch": 0.5023588277340958, + "grad_norm": 0.44618356227874756, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0337, + "step": 17570 + }, + { + "epoch": 0.5026447462473195, + "grad_norm": 0.4186992049217224, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0381, + "step": 17580 + }, + { + "epoch": 0.5029306647605433, + "grad_norm": 0.7387974858283997, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0319, + "step": 17590 + }, + { + "epoch": 0.503216583273767, + "grad_norm": 0.8068642020225525, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0373, + "step": 17600 + }, + { + "epoch": 0.5035025017869907, + "grad_norm": 0.5773473978042603, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0372, + "step": 17610 + }, + { + "epoch": 0.5037884203002144, + "grad_norm": 0.32488778233528137, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0334, + "step": 17620 + }, + { + "epoch": 0.5040743388134382, + "grad_norm": 0.33978500962257385, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0493, + "step": 17630 + }, + { + "epoch": 0.5043602573266619, + "grad_norm": 0.5897071361541748, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0335, + "step": 17640 + }, + { + "epoch": 0.5046461758398856, + "grad_norm": 0.6275895833969116, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0395, + "step": 17650 + }, + { + "epoch": 0.5049320943531094, + "grad_norm": 0.7995536923408508, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0422, + "step": 17660 + }, + { + "epoch": 0.505218012866333, + "grad_norm": 0.8734716773033142, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0414, + "step": 17670 + }, + { + "epoch": 0.5055039313795568, + "grad_norm": 0.6239343881607056, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0333, + "step": 17680 + }, + { + "epoch": 0.5057898498927805, + "grad_norm": 0.42508623003959656, + "learning_rate": 8.885721609997551e-06, + "loss": 0.045, + "step": 17690 + }, + { + "epoch": 0.5060757684060043, + "grad_norm": 0.4272485673427582, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0506, + "step": 17700 + }, + { + "epoch": 0.506361686919228, + "grad_norm": 0.8006368279457092, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0431, + "step": 17710 + }, + { + "epoch": 0.5066476054324518, + "grad_norm": 0.5896835327148438, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0322, + "step": 17720 + }, + { + "epoch": 0.5069335239456755, + "grad_norm": 0.6880389451980591, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0322, + "step": 17730 + }, + { + "epoch": 0.5072194424588992, + "grad_norm": 1.4850202798843384, + "learning_rate": 8.83836825410936e-06, + "loss": 0.052, + "step": 17740 + }, + { + "epoch": 0.507505360972123, + "grad_norm": 0.7684240937232971, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0353, + "step": 17750 + }, + { + "epoch": 0.5077912794853466, + "grad_norm": 0.5456307530403137, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0419, + "step": 17760 + }, + { + "epoch": 0.5080771979985704, + "grad_norm": 0.5775120258331299, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0366, + "step": 17770 + }, + { + "epoch": 0.5083631165117941, + "grad_norm": 0.6453070044517517, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0341, + "step": 17780 + }, + { + "epoch": 0.5086490350250179, + "grad_norm": 0.7906973361968994, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0405, + "step": 17790 + }, + { + "epoch": 0.5089349535382416, + "grad_norm": 1.0740606784820557, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0344, + "step": 17800 + }, + { + "epoch": 0.5092208720514654, + "grad_norm": 0.41854357719421387, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0334, + "step": 17810 + }, + { + "epoch": 0.5095067905646891, + "grad_norm": 0.6328964233398438, + "learning_rate": 8.762735374981932e-06, + "loss": 0.036, + "step": 17820 + }, + { + "epoch": 0.5097927090779127, + "grad_norm": 0.40875789523124695, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0338, + "step": 17830 + }, + { + "epoch": 0.5100786275911365, + "grad_norm": 0.5056312084197998, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0332, + "step": 17840 + }, + { + "epoch": 0.5103645461043602, + "grad_norm": 0.5005037784576416, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0416, + "step": 17850 + }, + { + "epoch": 0.510650464617584, + "grad_norm": 0.5689167380332947, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0329, + "step": 17860 + }, + { + "epoch": 0.5109363831308077, + "grad_norm": 0.5222717523574829, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0336, + "step": 17870 + }, + { + "epoch": 0.5112223016440315, + "grad_norm": 0.5998329520225525, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0354, + "step": 17880 + }, + { + "epoch": 0.5115082201572552, + "grad_norm": 0.4684480130672455, + "learning_rate": 8.69669425266315e-06, + "loss": 0.05, + "step": 17890 + }, + { + "epoch": 0.511794138670479, + "grad_norm": 0.4061124622821808, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0384, + "step": 17900 + }, + { + "epoch": 0.5120800571837026, + "grad_norm": 0.5025928020477295, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0386, + "step": 17910 + }, + { + "epoch": 0.5123659756969263, + "grad_norm": 0.3731222152709961, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0378, + "step": 17920 + }, + { + "epoch": 0.5126518942101501, + "grad_norm": 0.7784973978996277, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0419, + "step": 17930 + }, + { + "epoch": 0.5129378127233738, + "grad_norm": 0.7074074745178223, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0386, + "step": 17940 + }, + { + "epoch": 0.5132237312365976, + "grad_norm": 0.49802306294441223, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0418, + "step": 17950 + }, + { + "epoch": 0.5135096497498213, + "grad_norm": 0.4355427920818329, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0431, + "step": 17960 + }, + { + "epoch": 0.5137955682630451, + "grad_norm": 0.672635555267334, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0403, + "step": 17970 + }, + { + "epoch": 0.5140814867762687, + "grad_norm": 0.6733908653259277, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0487, + "step": 17980 + }, + { + "epoch": 0.5143674052894925, + "grad_norm": 0.43711504340171814, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0378, + "step": 17990 + }, + { + "epoch": 0.5146533238027162, + "grad_norm": 0.6371222138404846, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0336, + "step": 18000 + }, + { + "epoch": 0.5149392423159399, + "grad_norm": 0.8007041811943054, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0371, + "step": 18010 + }, + { + "epoch": 0.5152251608291637, + "grad_norm": 0.4725078344345093, + "learning_rate": 8.574400723012433e-06, + "loss": 0.037, + "step": 18020 + }, + { + "epoch": 0.5155110793423874, + "grad_norm": 0.34229791164398193, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0353, + "step": 18030 + }, + { + "epoch": 0.5157969978556112, + "grad_norm": 0.27863454818725586, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0371, + "step": 18040 + }, + { + "epoch": 0.5160829163688349, + "grad_norm": 0.43021920323371887, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0419, + "step": 18050 + }, + { + "epoch": 0.5163688348820586, + "grad_norm": 0.4683758318424225, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0307, + "step": 18060 + }, + { + "epoch": 0.5166547533952823, + "grad_norm": 0.29085367918014526, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0372, + "step": 18070 + }, + { + "epoch": 0.5169406719085061, + "grad_norm": 0.4396727681159973, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0328, + "step": 18080 + }, + { + "epoch": 0.5172265904217298, + "grad_norm": 0.539021372795105, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0317, + "step": 18090 + }, + { + "epoch": 0.5175125089349535, + "grad_norm": 0.556974470615387, + "learning_rate": 8.499380733111628e-06, + "loss": 0.037, + "step": 18100 + }, + { + "epoch": 0.5177984274481773, + "grad_norm": 0.4445747137069702, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0344, + "step": 18110 + }, + { + "epoch": 0.518084345961401, + "grad_norm": 0.3742713928222656, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0339, + "step": 18120 + }, + { + "epoch": 0.5183702644746248, + "grad_norm": 0.8467416167259216, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0409, + "step": 18130 + }, + { + "epoch": 0.5186561829878484, + "grad_norm": 0.7731484770774841, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0379, + "step": 18140 + }, + { + "epoch": 0.5189421015010722, + "grad_norm": 0.5664084553718567, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0353, + "step": 18150 + }, + { + "epoch": 0.5192280200142959, + "grad_norm": 0.5623966455459595, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0412, + "step": 18160 + }, + { + "epoch": 0.5195139385275197, + "grad_norm": 0.5074556469917297, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0402, + "step": 18170 + }, + { + "epoch": 0.5197998570407434, + "grad_norm": 0.49439728260040283, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0341, + "step": 18180 + }, + { + "epoch": 0.5200857755539671, + "grad_norm": 0.5982527136802673, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0308, + "step": 18190 + }, + { + "epoch": 0.5203716940671909, + "grad_norm": 0.7891598343849182, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0437, + "step": 18200 + }, + { + "epoch": 0.5206576125804145, + "grad_norm": 0.7565666437149048, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0381, + "step": 18210 + }, + { + "epoch": 0.5209435310936383, + "grad_norm": 0.33346351981163025, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0454, + "step": 18220 + }, + { + "epoch": 0.521229449606862, + "grad_norm": 0.5885659456253052, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0413, + "step": 18230 + }, + { + "epoch": 0.5215153681200858, + "grad_norm": 0.6487091183662415, + "learning_rate": 8.368551060444755e-06, + "loss": 0.035, + "step": 18240 + }, + { + "epoch": 0.5218012866333095, + "grad_norm": 0.9817430377006531, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0394, + "step": 18250 + }, + { + "epoch": 0.5220872051465333, + "grad_norm": 0.5691193342208862, + "learning_rate": 8.349909816537207e-06, + "loss": 0.041, + "step": 18260 + }, + { + "epoch": 0.522373123659757, + "grad_norm": 0.5326661467552185, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0361, + "step": 18270 + }, + { + "epoch": 0.5226590421729806, + "grad_norm": 0.5536142587661743, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0406, + "step": 18280 + }, + { + "epoch": 0.5229449606862044, + "grad_norm": 0.3482394218444824, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0423, + "step": 18290 + }, + { + "epoch": 0.5232308791994281, + "grad_norm": 0.514914333820343, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0352, + "step": 18300 + }, + { + "epoch": 0.5235167977126519, + "grad_norm": 0.7681404948234558, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0386, + "step": 18310 + }, + { + "epoch": 0.5238027162258756, + "grad_norm": 0.400426983833313, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0333, + "step": 18320 + }, + { + "epoch": 0.5240886347390994, + "grad_norm": 0.4996081590652466, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0381, + "step": 18330 + }, + { + "epoch": 0.5243745532523231, + "grad_norm": 0.5379085540771484, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0348, + "step": 18340 + }, + { + "epoch": 0.5246604717655469, + "grad_norm": 0.4462053179740906, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0307, + "step": 18350 + }, + { + "epoch": 0.5249463902787705, + "grad_norm": 0.7336096167564392, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0345, + "step": 18360 + }, + { + "epoch": 0.5252323087919942, + "grad_norm": 0.6676360368728638, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0346, + "step": 18370 + }, + { + "epoch": 0.525518227305218, + "grad_norm": 0.46608656644821167, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0334, + "step": 18380 + }, + { + "epoch": 0.5258041458184417, + "grad_norm": 0.4906940460205078, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0331, + "step": 18390 + }, + { + "epoch": 0.5260900643316655, + "grad_norm": 0.4200032353401184, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0394, + "step": 18400 + }, + { + "epoch": 0.5263759828448892, + "grad_norm": 0.5663877725601196, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0349, + "step": 18410 + }, + { + "epoch": 0.526661901358113, + "grad_norm": 0.36824384331703186, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0303, + "step": 18420 + }, + { + "epoch": 0.5269478198713367, + "grad_norm": 0.8120076060295105, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0443, + "step": 18430 + }, + { + "epoch": 0.5272337383845604, + "grad_norm": 0.4102472960948944, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0369, + "step": 18440 + }, + { + "epoch": 0.5275196568977841, + "grad_norm": 0.5186526775360107, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0338, + "step": 18450 + }, + { + "epoch": 0.5278055754110078, + "grad_norm": 0.9650108218193054, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0343, + "step": 18460 + }, + { + "epoch": 0.5280914939242316, + "grad_norm": 0.5894375443458557, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0416, + "step": 18470 + }, + { + "epoch": 0.5283774124374553, + "grad_norm": 0.6188816428184509, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0402, + "step": 18480 + }, + { + "epoch": 0.5286633309506791, + "grad_norm": 0.35280847549438477, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0363, + "step": 18490 + }, + { + "epoch": 0.5289492494639028, + "grad_norm": 0.7289313673973083, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0392, + "step": 18500 + }, + { + "epoch": 0.5292351679771266, + "grad_norm": 0.505050778388977, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0329, + "step": 18510 + }, + { + "epoch": 0.5295210864903502, + "grad_norm": 0.7029705047607422, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0344, + "step": 18520 + }, + { + "epoch": 0.529807005003574, + "grad_norm": 0.2958471477031708, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0431, + "step": 18530 + }, + { + "epoch": 0.5300929235167977, + "grad_norm": 0.9649683237075806, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0329, + "step": 18540 + }, + { + "epoch": 0.5303788420300214, + "grad_norm": 0.24733735620975494, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0354, + "step": 18550 + }, + { + "epoch": 0.5306647605432452, + "grad_norm": 0.44838136434555054, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0334, + "step": 18560 + }, + { + "epoch": 0.5309506790564689, + "grad_norm": 0.4505597949028015, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0338, + "step": 18570 + }, + { + "epoch": 0.5312365975696927, + "grad_norm": 0.44188442826271057, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0326, + "step": 18580 + }, + { + "epoch": 0.5315225160829163, + "grad_norm": 0.4539152979850769, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0369, + "step": 18590 + }, + { + "epoch": 0.5318084345961401, + "grad_norm": 0.8311023712158203, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0441, + "step": 18600 + }, + { + "epoch": 0.5320943531093638, + "grad_norm": 0.53764808177948, + "learning_rate": 8.025779439806006e-06, + "loss": 0.037, + "step": 18610 + }, + { + "epoch": 0.5323802716225876, + "grad_norm": 1.2192102670669556, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0369, + "step": 18620 + }, + { + "epoch": 0.5326661901358113, + "grad_norm": 0.5254611968994141, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0371, + "step": 18630 + }, + { + "epoch": 0.532952108649035, + "grad_norm": 0.585709810256958, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0337, + "step": 18640 + }, + { + "epoch": 0.5332380271622588, + "grad_norm": 0.45416259765625, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0425, + "step": 18650 + }, + { + "epoch": 0.5335239456754824, + "grad_norm": 0.3957739472389221, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0354, + "step": 18660 + }, + { + "epoch": 0.5338098641887062, + "grad_norm": 0.6211117506027222, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0347, + "step": 18670 + }, + { + "epoch": 0.5340957827019299, + "grad_norm": 0.49023327231407166, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0308, + "step": 18680 + }, + { + "epoch": 0.5343817012151537, + "grad_norm": 0.5823351144790649, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0351, + "step": 18690 + }, + { + "epoch": 0.5346676197283774, + "grad_norm": 0.6048677563667297, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0382, + "step": 18700 + }, + { + "epoch": 0.5349535382416012, + "grad_norm": 0.5293828845024109, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0329, + "step": 18710 + }, + { + "epoch": 0.5352394567548249, + "grad_norm": 0.5935509204864502, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0388, + "step": 18720 + }, + { + "epoch": 0.5355253752680486, + "grad_norm": 0.8369598388671875, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0421, + "step": 18730 + }, + { + "epoch": 0.5358112937812723, + "grad_norm": 0.6874870657920837, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0385, + "step": 18740 + }, + { + "epoch": 0.536097212294496, + "grad_norm": 0.43511492013931274, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0405, + "step": 18750 + }, + { + "epoch": 0.5363831308077198, + "grad_norm": 0.662755012512207, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0375, + "step": 18760 + }, + { + "epoch": 0.5366690493209435, + "grad_norm": 0.5519852638244629, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0351, + "step": 18770 + }, + { + "epoch": 0.5369549678341673, + "grad_norm": 0.9711637496948242, + "learning_rate": 7.869858673101027e-06, + "loss": 0.038, + "step": 18780 + }, + { + "epoch": 0.537240886347391, + "grad_norm": 0.4944411516189575, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0416, + "step": 18790 + }, + { + "epoch": 0.5375268048606148, + "grad_norm": 0.5257377624511719, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0349, + "step": 18800 + }, + { + "epoch": 0.5378127233738385, + "grad_norm": 0.4833063781261444, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0414, + "step": 18810 + }, + { + "epoch": 0.5380986418870621, + "grad_norm": 0.4496164917945862, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0369, + "step": 18820 + }, + { + "epoch": 0.5383845604002859, + "grad_norm": 0.6939138174057007, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0337, + "step": 18830 + }, + { + "epoch": 0.5386704789135096, + "grad_norm": 0.32579538226127625, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0371, + "step": 18840 + }, + { + "epoch": 0.5389563974267334, + "grad_norm": 0.35594654083251953, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0366, + "step": 18850 + }, + { + "epoch": 0.5392423159399571, + "grad_norm": 0.6114012002944946, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0372, + "step": 18860 + }, + { + "epoch": 0.5395282344531809, + "grad_norm": 0.8492457270622253, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0346, + "step": 18870 + }, + { + "epoch": 0.5398141529664046, + "grad_norm": 0.5214036703109741, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0388, + "step": 18880 + }, + { + "epoch": 0.5401000714796284, + "grad_norm": 0.428671658039093, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0394, + "step": 18890 + }, + { + "epoch": 0.540385989992852, + "grad_norm": 0.6071562767028809, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0371, + "step": 18900 + }, + { + "epoch": 0.5406719085060757, + "grad_norm": 0.41996505856513977, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0334, + "step": 18910 + }, + { + "epoch": 0.5409578270192995, + "grad_norm": 0.5260844826698303, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0362, + "step": 18920 + }, + { + "epoch": 0.5412437455325232, + "grad_norm": 0.43362122774124146, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0325, + "step": 18930 + }, + { + "epoch": 0.541529664045747, + "grad_norm": 0.4597149193286896, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0345, + "step": 18940 + }, + { + "epoch": 0.5418155825589707, + "grad_norm": 0.6667322516441345, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0421, + "step": 18950 + }, + { + "epoch": 0.5421015010721945, + "grad_norm": 0.8998900651931763, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0368, + "step": 18960 + }, + { + "epoch": 0.5423874195854181, + "grad_norm": 0.5075538158416748, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0327, + "step": 18970 + }, + { + "epoch": 0.5426733380986419, + "grad_norm": 0.38445526361465454, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0446, + "step": 18980 + }, + { + "epoch": 0.5429592566118656, + "grad_norm": 0.696186363697052, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0364, + "step": 18990 + }, + { + "epoch": 0.5432451751250893, + "grad_norm": 0.6371187567710876, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0401, + "step": 19000 + }, + { + "epoch": 0.5435310936383131, + "grad_norm": 0.6122881174087524, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0345, + "step": 19010 + }, + { + "epoch": 0.5438170121515368, + "grad_norm": 0.4222267270088196, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0456, + "step": 19020 + }, + { + "epoch": 0.5441029306647606, + "grad_norm": 0.6122517585754395, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0434, + "step": 19030 + }, + { + "epoch": 0.5443888491779842, + "grad_norm": 0.2783992886543274, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0354, + "step": 19040 + }, + { + "epoch": 0.544674767691208, + "grad_norm": 0.6433000564575195, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0321, + "step": 19050 + }, + { + "epoch": 0.5449606862044317, + "grad_norm": 0.6967030167579651, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0394, + "step": 19060 + }, + { + "epoch": 0.5452466047176555, + "grad_norm": 0.4799044132232666, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0329, + "step": 19070 + }, + { + "epoch": 0.5455325232308792, + "grad_norm": 0.633895993232727, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0316, + "step": 19080 + }, + { + "epoch": 0.5458184417441029, + "grad_norm": 0.5601945519447327, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0449, + "step": 19090 + }, + { + "epoch": 0.5461043602573267, + "grad_norm": 0.4917007088661194, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0351, + "step": 19100 + }, + { + "epoch": 0.5463902787705504, + "grad_norm": 0.4813363254070282, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.029, + "step": 19110 + }, + { + "epoch": 0.5466761972837741, + "grad_norm": 0.5359676480293274, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0346, + "step": 19120 + }, + { + "epoch": 0.5469621157969978, + "grad_norm": 0.6500958204269409, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0374, + "step": 19130 + }, + { + "epoch": 0.5472480343102216, + "grad_norm": 0.7708510756492615, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0332, + "step": 19140 + }, + { + "epoch": 0.5475339528234453, + "grad_norm": 0.45693230628967285, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0344, + "step": 19150 + }, + { + "epoch": 0.5478198713366691, + "grad_norm": 0.6046226620674133, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0342, + "step": 19160 + }, + { + "epoch": 0.5481057898498928, + "grad_norm": 0.5253175497055054, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0449, + "step": 19170 + }, + { + "epoch": 0.5483917083631165, + "grad_norm": 0.3790060877799988, + "learning_rate": 7.507267205473318e-06, + "loss": 0.037, + "step": 19180 + }, + { + "epoch": 0.5486776268763403, + "grad_norm": 0.37709203362464905, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0346, + "step": 19190 + }, + { + "epoch": 0.5489635453895639, + "grad_norm": 0.3940931558609009, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0427, + "step": 19200 + }, + { + "epoch": 0.5492494639027877, + "grad_norm": 0.761299192905426, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0353, + "step": 19210 + }, + { + "epoch": 0.5495353824160114, + "grad_norm": 0.5268495082855225, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0328, + "step": 19220 + }, + { + "epoch": 0.5498213009292352, + "grad_norm": 0.45624151825904846, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0353, + "step": 19230 + }, + { + "epoch": 0.5501072194424589, + "grad_norm": 0.5374972224235535, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0345, + "step": 19240 + }, + { + "epoch": 0.5503931379556827, + "grad_norm": 0.49830907583236694, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0328, + "step": 19250 + }, + { + "epoch": 0.5506790564689064, + "grad_norm": 0.6223296523094177, + "learning_rate": 7.435514206212475e-06, + "loss": 0.037, + "step": 19260 + }, + { + "epoch": 0.55096497498213, + "grad_norm": 0.42801398038864136, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0371, + "step": 19270 + }, + { + "epoch": 0.5512508934953538, + "grad_norm": 0.3872825801372528, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0314, + "step": 19280 + }, + { + "epoch": 0.5515368120085775, + "grad_norm": 0.3967494070529938, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0349, + "step": 19290 + }, + { + "epoch": 0.5518227305218013, + "grad_norm": 0.42383769154548645, + "learning_rate": 7.399737764864619e-06, + "loss": 0.045, + "step": 19300 + }, + { + "epoch": 0.552108649035025, + "grad_norm": 0.48501884937286377, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0373, + "step": 19310 + }, + { + "epoch": 0.5523945675482488, + "grad_norm": 0.3783693015575409, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0334, + "step": 19320 + }, + { + "epoch": 0.5526804860614725, + "grad_norm": 0.5733019709587097, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0369, + "step": 19330 + }, + { + "epoch": 0.5529664045746963, + "grad_norm": 0.5022825002670288, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0375, + "step": 19340 + }, + { + "epoch": 0.5532523230879199, + "grad_norm": 0.5508015155792236, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0415, + "step": 19350 + }, + { + "epoch": 0.5535382416011436, + "grad_norm": 0.5692425966262817, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0401, + "step": 19360 + }, + { + "epoch": 0.5538241601143674, + "grad_norm": 0.7247840762138367, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0373, + "step": 19370 + }, + { + "epoch": 0.5541100786275911, + "grad_norm": 0.633986234664917, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0435, + "step": 19380 + }, + { + "epoch": 0.5543959971408149, + "grad_norm": 0.8598711490631104, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0424, + "step": 19390 + }, + { + "epoch": 0.5546819156540386, + "grad_norm": 0.782328188419342, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0424, + "step": 19400 + }, + { + "epoch": 0.5549678341672624, + "grad_norm": 0.48890456557273865, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0351, + "step": 19410 + }, + { + "epoch": 0.555253752680486, + "grad_norm": 0.4759981036186218, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0395, + "step": 19420 + }, + { + "epoch": 0.5555396711937098, + "grad_norm": 0.6431323885917664, + "learning_rate": 7.283934675167239e-06, + "loss": 0.036, + "step": 19430 + }, + { + "epoch": 0.5558255897069335, + "grad_norm": 0.6633809208869934, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0333, + "step": 19440 + }, + { + "epoch": 0.5561115082201572, + "grad_norm": 0.3405994772911072, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0375, + "step": 19450 + }, + { + "epoch": 0.556397426733381, + "grad_norm": 0.3443987965583801, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0329, + "step": 19460 + }, + { + "epoch": 0.5566833452466047, + "grad_norm": 0.7973398566246033, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0412, + "step": 19470 + }, + { + "epoch": 0.5569692637598285, + "grad_norm": 0.43843239545822144, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0302, + "step": 19480 + }, + { + "epoch": 0.5572551822730522, + "grad_norm": 0.6797782182693481, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0401, + "step": 19490 + }, + { + "epoch": 0.557541100786276, + "grad_norm": 0.5020610690116882, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0329, + "step": 19500 + }, + { + "epoch": 0.5578270192994996, + "grad_norm": 0.5093050003051758, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0381, + "step": 19510 + }, + { + "epoch": 0.5581129378127234, + "grad_norm": 0.6136947870254517, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0373, + "step": 19520 + }, + { + "epoch": 0.5583988563259471, + "grad_norm": 0.4213317930698395, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0338, + "step": 19530 + }, + { + "epoch": 0.5586847748391708, + "grad_norm": 0.6560636162757874, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0332, + "step": 19540 + }, + { + "epoch": 0.5589706933523946, + "grad_norm": 0.41303765773773193, + "learning_rate": 7.177693135871202e-06, + "loss": 0.03, + "step": 19550 + }, + { + "epoch": 0.5592566118656183, + "grad_norm": 0.5260538458824158, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0328, + "step": 19560 + }, + { + "epoch": 0.559542530378842, + "grad_norm": 0.6076327562332153, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0428, + "step": 19570 + }, + { + "epoch": 0.5598284488920657, + "grad_norm": 0.635111927986145, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0318, + "step": 19580 + }, + { + "epoch": 0.5601143674052895, + "grad_norm": 0.7933056354522705, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0357, + "step": 19590 + }, + { + "epoch": 0.5604002859185132, + "grad_norm": 0.44312241673469543, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0289, + "step": 19600 + }, + { + "epoch": 0.560686204431737, + "grad_norm": 0.36346134543418884, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0354, + "step": 19610 + }, + { + "epoch": 0.5609721229449607, + "grad_norm": 0.49605289101600647, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0367, + "step": 19620 + }, + { + "epoch": 0.5612580414581844, + "grad_norm": 0.7115452289581299, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0354, + "step": 19630 + }, + { + "epoch": 0.5615439599714082, + "grad_norm": 0.650925874710083, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0353, + "step": 19640 + }, + { + "epoch": 0.5618298784846318, + "grad_norm": 0.5046663880348206, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0294, + "step": 19650 + }, + { + "epoch": 0.5621157969978556, + "grad_norm": 0.4441855549812317, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0326, + "step": 19660 + }, + { + "epoch": 0.5624017155110793, + "grad_norm": 0.3956650495529175, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0446, + "step": 19670 + }, + { + "epoch": 0.5626876340243031, + "grad_norm": 0.5384211540222168, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0331, + "step": 19680 + }, + { + "epoch": 0.5629735525375268, + "grad_norm": 0.6183366775512695, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0324, + "step": 19690 + }, + { + "epoch": 0.5632594710507506, + "grad_norm": 0.9116242527961731, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0341, + "step": 19700 + }, + { + "epoch": 0.5635453895639743, + "grad_norm": 0.8171015381813049, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0306, + "step": 19710 + }, + { + "epoch": 0.563831308077198, + "grad_norm": 0.42670243978500366, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0336, + "step": 19720 + }, + { + "epoch": 0.5641172265904217, + "grad_norm": 0.7338811159133911, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0363, + "step": 19730 + }, + { + "epoch": 0.5644031451036454, + "grad_norm": 0.5576338171958923, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0371, + "step": 19740 + }, + { + "epoch": 0.5646890636168692, + "grad_norm": 0.7390629649162292, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0382, + "step": 19750 + }, + { + "epoch": 0.5649749821300929, + "grad_norm": 0.801812469959259, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0379, + "step": 19760 + }, + { + "epoch": 0.5652609006433167, + "grad_norm": 0.5697385668754578, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0369, + "step": 19770 + }, + { + "epoch": 0.5655468191565404, + "grad_norm": 0.4180932343006134, + "learning_rate": 6.975884226362e-06, + "loss": 0.039, + "step": 19780 + }, + { + "epoch": 0.5658327376697642, + "grad_norm": 0.648389995098114, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0346, + "step": 19790 + }, + { + "epoch": 0.5661186561829878, + "grad_norm": 0.9673929214477539, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0392, + "step": 19800 + }, + { + "epoch": 0.5664045746962115, + "grad_norm": 0.4793975353240967, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0321, + "step": 19810 + }, + { + "epoch": 0.5666904932094353, + "grad_norm": 0.5206098556518555, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0319, + "step": 19820 + }, + { + "epoch": 0.566976411722659, + "grad_norm": 0.39929306507110596, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0335, + "step": 19830 + }, + { + "epoch": 0.5672623302358828, + "grad_norm": 0.6819440722465515, + "learning_rate": 6.923644220932124e-06, + "loss": 0.0338, + "step": 19840 + }, + { + "epoch": 0.5675482487491065, + "grad_norm": 0.7612042427062988, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0345, + "step": 19850 + }, + { + "epoch": 0.5678341672623303, + "grad_norm": 0.472676545381546, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0388, + "step": 19860 + }, + { + "epoch": 0.568120085775554, + "grad_norm": 0.48102107644081116, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0304, + "step": 19870 + }, + { + "epoch": 0.5684060042887777, + "grad_norm": 0.4174644649028778, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0315, + "step": 19880 + }, + { + "epoch": 0.5686919228020014, + "grad_norm": 0.4218151271343231, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0413, + "step": 19890 + }, + { + "epoch": 0.5689778413152251, + "grad_norm": 0.8243978023529053, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0399, + "step": 19900 + }, + { + "epoch": 0.5692637598284489, + "grad_norm": 0.400924414396286, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0298, + "step": 19910 + }, + { + "epoch": 0.5695496783416726, + "grad_norm": 0.5199277400970459, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0351, + "step": 19920 + }, + { + "epoch": 0.5698355968548964, + "grad_norm": 0.5238781571388245, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0374, + "step": 19930 + }, + { + "epoch": 0.5701215153681201, + "grad_norm": 0.7451756596565247, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0378, + "step": 19940 + }, + { + "epoch": 0.5704074338813439, + "grad_norm": 0.5029926300048828, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0391, + "step": 19950 + }, + { + "epoch": 0.5706933523945675, + "grad_norm": 0.5532147884368896, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0383, + "step": 19960 + }, + { + "epoch": 0.5709792709077913, + "grad_norm": 0.5694131851196289, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0364, + "step": 19970 + }, + { + "epoch": 0.571265189421015, + "grad_norm": 0.5066515803337097, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0363, + "step": 19980 + }, + { + "epoch": 0.5715511079342387, + "grad_norm": 0.5676470398902893, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0327, + "step": 19990 + }, + { + "epoch": 0.5718370264474625, + "grad_norm": 0.37414318323135376, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0395, + "step": 20000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.24927780847616e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/training_args.bin b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a8e9db2fc8c02e02c3d9dc8ab6720ad303a5b3a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612ba70c7690571cb25b3741b149289d0da6675f330268700d4dd75e92ecc19a +size 6097 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/generation_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a93d11aa0e7357bd3e30243d2c25b2757dc74a9b --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd6f325619a0e3853df9bac524c57cd5f5b086881217daf8e8e3978bf7c6882d +size 4921072616 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..63a89bb7129f20eddbbd5759780779671461c778 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aab9e63301d5c2e25dffdcc629daa9aeeb2a75b6dc4cbfb1058a86914093bcaf +size 4978830984 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..144929f0eb4aaeb4528f74b7c3ffc716b1a73b3c --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22f22d09a1b0d9884b8e57e29c317a64e43700a2fb9e7fee291d727457b1dfa6 +size 4100977896 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..7a37358d95e92a337ffbc69008e6d3a514583ff2 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -15.553912042236327, + -29.199742523193358, + -19.58108451538086, + -2.290254103851318, + -3.98537020587921, + -3.326780859374999, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 20.256868560791013, + 29.94644501495361, + 21.81786548461914, + 2.931905368041992, + 5.064435471534729, + 3.8213318216323877, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.8829866647720337, + 2.0021812915802, + 0.2094610631465912, + 0.0940750315785408, + 0.0910087525844574, + 0.012966467998921871, + -0.09716881066560745, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.976093769073486, + 10.930583953857422, + 8.330232620239258, + 0.8605863451957703, + 1.5304595232009888, + 1.1747541427612305, + 0.995267927646637, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -14.624815139007566, + -31.510755078125, + -35.281760287475585, + -4.413841687011719, + -8.509904860687255, + -6.548201916885375, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 40.4127169593811, + 31.91034956970215, + 26.84413584289551, + 7.540738459014893, + 10.178268561553956, + 9.913993389892582, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 10.31286334991455, + 3.0421667098999023, + -4.947638511657715, + 0.41632387042045593, + -0.9987452030181885, + -0.18793217837810516, + -0.08814626932144165, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 10.463665962219238, + 14.231209754943848, + 11.03242301940918, + 2.1795010566711426, + 3.3540749549865723, + 2.708117961883545, + 0.9961075186729431, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0147427ce23ee07a06bd39c939d8b885b149557e --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json @@ -0,0 +1,15434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6290207290922087, + "eval_steps": 500, + "global_step": 22000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002859185132237312, + "grad_norm": 4.32843542098999, + "learning_rate": 1.8e-07, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0005718370264474624, + "grad_norm": 5.184113502502441, + "learning_rate": 3.8e-07, + "loss": 0.6206, + "step": 20 + }, + { + "epoch": 0.0008577555396711937, + "grad_norm": 4.515527248382568, + "learning_rate": 5.800000000000001e-07, + "loss": 0.582, + "step": 30 + }, + { + "epoch": 0.0011436740528949249, + "grad_norm": 2.8382818698883057, + "learning_rate": 7.8e-07, + "loss": 0.544, + "step": 40 + }, + { + "epoch": 0.0014295925661186562, + "grad_norm": 4.019079208374023, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6381, + "step": 50 + }, + { + "epoch": 0.0017155110793423873, + "grad_norm": 2.9916157722473145, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5463, + "step": 60 + }, + { + "epoch": 0.0020014295925661185, + "grad_norm": 3.3288328647613525, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.446, + "step": 70 + }, + { + "epoch": 0.0022873481057898498, + "grad_norm": 3.181410312652588, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4497, + "step": 80 + }, + { + "epoch": 0.002573266619013581, + "grad_norm": 1.421942949295044, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.349, + "step": 90 + }, + { + "epoch": 0.0028591851322373124, + "grad_norm": 1.908596396446228, + "learning_rate": 1.98e-06, + "loss": 0.3338, + "step": 100 + }, + { + "epoch": 0.0031451036454610438, + "grad_norm": 1.8309729099273682, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.0034310221586847747, + "grad_norm": 3.051408290863037, + "learning_rate": 2.38e-06, + "loss": 0.2418, + "step": 120 + }, + { + "epoch": 0.003716940671908506, + "grad_norm": 2.4083356857299805, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1726, + "step": 130 + }, + { + "epoch": 0.004002859185132237, + "grad_norm": 1.111687421798706, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.2164, + "step": 140 + }, + { + "epoch": 0.004288777698355968, + "grad_norm": 1.3874679803848267, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1312, + "step": 150 + }, + { + "epoch": 0.0045746962115796996, + "grad_norm": 1.2791540622711182, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1198, + "step": 160 + }, + { + "epoch": 0.004860614724803431, + "grad_norm": 1.6237181425094604, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1027, + "step": 170 + }, + { + "epoch": 0.005146533238027162, + "grad_norm": 0.9669432640075684, + "learning_rate": 3.58e-06, + "loss": 0.0968, + "step": 180 + }, + { + "epoch": 0.0054324517512508936, + "grad_norm": 1.4933182001113892, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.1012, + "step": 190 + }, + { + "epoch": 0.005718370264474625, + "grad_norm": 1.8615745306015015, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0901, + "step": 200 + }, + { + "epoch": 0.006004288777698356, + "grad_norm": 1.867163062095642, + "learning_rate": 4.18e-06, + "loss": 0.1067, + "step": 210 + }, + { + "epoch": 0.0062902072909220876, + "grad_norm": 1.199497103691101, + "learning_rate": 4.38e-06, + "loss": 0.0841, + "step": 220 + }, + { + "epoch": 0.006576125804145818, + "grad_norm": 1.1568272113800049, + "learning_rate": 4.58e-06, + "loss": 0.0951, + "step": 230 + }, + { + "epoch": 0.006862044317369549, + "grad_norm": 2.139226198196411, + "learning_rate": 4.78e-06, + "loss": 0.0845, + "step": 240 + }, + { + "epoch": 0.007147962830593281, + "grad_norm": 1.0357667207717896, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0828, + "step": 250 + }, + { + "epoch": 0.007433881343817012, + "grad_norm": 1.0145683288574219, + "learning_rate": 5.18e-06, + "loss": 0.0925, + "step": 260 + }, + { + "epoch": 0.007719799857040743, + "grad_norm": 1.308053731918335, + "learning_rate": 5.380000000000001e-06, + "loss": 0.082, + "step": 270 + }, + { + "epoch": 0.008005718370264474, + "grad_norm": 1.1561739444732666, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0888, + "step": 280 + }, + { + "epoch": 0.008291636883488206, + "grad_norm": 0.8777005672454834, + "learning_rate": 5.78e-06, + "loss": 0.0693, + "step": 290 + }, + { + "epoch": 0.008577555396711936, + "grad_norm": 0.9127368330955505, + "learning_rate": 5.98e-06, + "loss": 0.0823, + "step": 300 + }, + { + "epoch": 0.008863473909935669, + "grad_norm": 0.5608117580413818, + "learning_rate": 6.18e-06, + "loss": 0.0733, + "step": 310 + }, + { + "epoch": 0.009149392423159399, + "grad_norm": 1.9068444967269897, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0772, + "step": 320 + }, + { + "epoch": 0.009435310936383131, + "grad_norm": 0.9090886116027832, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.062, + "step": 330 + }, + { + "epoch": 0.009721229449606862, + "grad_norm": 1.191778540611267, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0718, + "step": 340 + }, + { + "epoch": 0.010007147962830594, + "grad_norm": 1.3743036985397339, + "learning_rate": 6.98e-06, + "loss": 0.0822, + "step": 350 + }, + { + "epoch": 0.010293066476054324, + "grad_norm": 1.4244364500045776, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0793, + "step": 360 + }, + { + "epoch": 0.010578984989278055, + "grad_norm": 1.1766910552978516, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0637, + "step": 370 + }, + { + "epoch": 0.010864903502501787, + "grad_norm": 1.1331329345703125, + "learning_rate": 7.58e-06, + "loss": 0.0705, + "step": 380 + }, + { + "epoch": 0.011150822015725518, + "grad_norm": 0.4898548424243927, + "learning_rate": 7.78e-06, + "loss": 0.0686, + "step": 390 + }, + { + "epoch": 0.01143674052894925, + "grad_norm": 0.7398406267166138, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0719, + "step": 400 + }, + { + "epoch": 0.01172265904217298, + "grad_norm": 1.1516162157058716, + "learning_rate": 8.18e-06, + "loss": 0.0696, + "step": 410 + }, + { + "epoch": 0.012008577555396712, + "grad_norm": 1.6034163236618042, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0698, + "step": 420 + }, + { + "epoch": 0.012294496068620443, + "grad_norm": 1.2195311784744263, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.012580414581844175, + "grad_norm": 1.1106441020965576, + "learning_rate": 8.78e-06, + "loss": 0.0749, + "step": 440 + }, + { + "epoch": 0.012866333095067906, + "grad_norm": 1.1787506341934204, + "learning_rate": 8.98e-06, + "loss": 0.0718, + "step": 450 + }, + { + "epoch": 0.013152251608291636, + "grad_norm": 0.4380492568016052, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0692, + "step": 460 + }, + { + "epoch": 0.013438170121515368, + "grad_norm": 1.0138392448425293, + "learning_rate": 9.38e-06, + "loss": 0.0718, + "step": 470 + }, + { + "epoch": 0.013724088634739099, + "grad_norm": 0.50003582239151, + "learning_rate": 9.58e-06, + "loss": 0.078, + "step": 480 + }, + { + "epoch": 0.014010007147962831, + "grad_norm": 0.6253323554992676, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0762, + "step": 490 + }, + { + "epoch": 0.014295925661186561, + "grad_norm": 0.6725791096687317, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0615, + "step": 500 + }, + { + "epoch": 0.014581844174410294, + "grad_norm": 0.6100206971168518, + "learning_rate": 1.018e-05, + "loss": 0.0576, + "step": 510 + }, + { + "epoch": 0.014867762687634024, + "grad_norm": 1.9225071668624878, + "learning_rate": 1.038e-05, + "loss": 0.0957, + "step": 520 + }, + { + "epoch": 0.015153681200857756, + "grad_norm": 1.304625391960144, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.015439599714081487, + "grad_norm": 0.7657200694084167, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0612, + "step": 540 + }, + { + "epoch": 0.015725518227305217, + "grad_norm": 0.7371220588684082, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0719, + "step": 550 + }, + { + "epoch": 0.016011436740528948, + "grad_norm": 0.7274985313415527, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0594, + "step": 560 + }, + { + "epoch": 0.01629735525375268, + "grad_norm": 1.3222947120666504, + "learning_rate": 1.138e-05, + "loss": 0.0655, + "step": 570 + }, + { + "epoch": 0.016583273766976412, + "grad_norm": 0.965411901473999, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0638, + "step": 580 + }, + { + "epoch": 0.016869192280200142, + "grad_norm": 0.8161532878875732, + "learning_rate": 1.178e-05, + "loss": 0.0532, + "step": 590 + }, + { + "epoch": 0.017155110793423873, + "grad_norm": 0.8228808045387268, + "learning_rate": 1.198e-05, + "loss": 0.051, + "step": 600 + }, + { + "epoch": 0.017441029306647607, + "grad_norm": 0.6932743191719055, + "learning_rate": 1.218e-05, + "loss": 0.0595, + "step": 610 + }, + { + "epoch": 0.017726947819871337, + "grad_norm": 0.6848511099815369, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0589, + "step": 620 + }, + { + "epoch": 0.018012866333095068, + "grad_norm": 1.137454867362976, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0694, + "step": 630 + }, + { + "epoch": 0.018298784846318798, + "grad_norm": 0.8087878227233887, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0738, + "step": 640 + }, + { + "epoch": 0.01858470335954253, + "grad_norm": 0.8093737363815308, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.058, + "step": 650 + }, + { + "epoch": 0.018870621872766263, + "grad_norm": 0.8387401700019836, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0686, + "step": 660 + }, + { + "epoch": 0.019156540385989993, + "grad_norm": 1.1544110774993896, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0592, + "step": 670 + }, + { + "epoch": 0.019442458899213724, + "grad_norm": 0.8208314180374146, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.019728377412437454, + "grad_norm": 0.97088623046875, + "learning_rate": 1.378e-05, + "loss": 0.0675, + "step": 690 + }, + { + "epoch": 0.020014295925661188, + "grad_norm": 1.0991814136505127, + "learning_rate": 1.398e-05, + "loss": 0.0745, + "step": 700 + }, + { + "epoch": 0.02030021443888492, + "grad_norm": 0.9467299580574036, + "learning_rate": 1.418e-05, + "loss": 0.0645, + "step": 710 + }, + { + "epoch": 0.02058613295210865, + "grad_norm": 0.4910801351070404, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0466, + "step": 720 + }, + { + "epoch": 0.02087205146533238, + "grad_norm": 1.0102845430374146, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0735, + "step": 730 + }, + { + "epoch": 0.02115796997855611, + "grad_norm": 0.9033467769622803, + "learning_rate": 1.478e-05, + "loss": 0.0741, + "step": 740 + }, + { + "epoch": 0.021443888491779844, + "grad_norm": 1.6092171669006348, + "learning_rate": 1.498e-05, + "loss": 0.0737, + "step": 750 + }, + { + "epoch": 0.021729807005003574, + "grad_norm": 0.7047333717346191, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0604, + "step": 760 + }, + { + "epoch": 0.022015725518227305, + "grad_norm": 1.2015491724014282, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0646, + "step": 770 + }, + { + "epoch": 0.022301644031451035, + "grad_norm": 1.1669623851776123, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0587, + "step": 780 + }, + { + "epoch": 0.02258756254467477, + "grad_norm": 1.137113094329834, + "learning_rate": 1.578e-05, + "loss": 0.0692, + "step": 790 + }, + { + "epoch": 0.0228734810578985, + "grad_norm": 1.269505262374878, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0711, + "step": 800 + }, + { + "epoch": 0.02315939957112223, + "grad_norm": 0.942534863948822, + "learning_rate": 1.618e-05, + "loss": 0.0782, + "step": 810 + }, + { + "epoch": 0.02344531808434596, + "grad_norm": 0.9548556208610535, + "learning_rate": 1.638e-05, + "loss": 0.0814, + "step": 820 + }, + { + "epoch": 0.02373123659756969, + "grad_norm": 1.0210421085357666, + "learning_rate": 1.658e-05, + "loss": 0.0774, + "step": 830 + }, + { + "epoch": 0.024017155110793425, + "grad_norm": 1.0955135822296143, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0693, + "step": 840 + }, + { + "epoch": 0.024303073624017155, + "grad_norm": 1.2081682682037354, + "learning_rate": 1.698e-05, + "loss": 0.0589, + "step": 850 + }, + { + "epoch": 0.024588992137240886, + "grad_norm": 0.9728164076805115, + "learning_rate": 1.718e-05, + "loss": 0.0585, + "step": 860 + }, + { + "epoch": 0.024874910650464616, + "grad_norm": 1.310244083404541, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.066, + "step": 870 + }, + { + "epoch": 0.02516082916368835, + "grad_norm": 0.8860681653022766, + "learning_rate": 1.758e-05, + "loss": 0.0703, + "step": 880 + }, + { + "epoch": 0.02544674767691208, + "grad_norm": 2.1878466606140137, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0913, + "step": 890 + }, + { + "epoch": 0.02573266619013581, + "grad_norm": 0.6659205555915833, + "learning_rate": 1.798e-05, + "loss": 0.0603, + "step": 900 + }, + { + "epoch": 0.02601858470335954, + "grad_norm": 0.6700656414031982, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.074, + "step": 910 + }, + { + "epoch": 0.026304503216583272, + "grad_norm": 0.8292778134346008, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0529, + "step": 920 + }, + { + "epoch": 0.026590421729807006, + "grad_norm": 0.9637550115585327, + "learning_rate": 1.858e-05, + "loss": 0.0604, + "step": 930 + }, + { + "epoch": 0.026876340243030736, + "grad_norm": 0.4605652689933777, + "learning_rate": 1.878e-05, + "loss": 0.0657, + "step": 940 + }, + { + "epoch": 0.027162258756254467, + "grad_norm": 1.3346972465515137, + "learning_rate": 1.898e-05, + "loss": 0.0576, + "step": 950 + }, + { + "epoch": 0.027448177269478197, + "grad_norm": 0.8369432091712952, + "learning_rate": 1.918e-05, + "loss": 0.0567, + "step": 960 + }, + { + "epoch": 0.02773409578270193, + "grad_norm": 0.613459050655365, + "learning_rate": 1.938e-05, + "loss": 0.0523, + "step": 970 + }, + { + "epoch": 0.028020014295925662, + "grad_norm": 1.402799367904663, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0794, + "step": 980 + }, + { + "epoch": 0.028305932809149392, + "grad_norm": 1.1603201627731323, + "learning_rate": 1.978e-05, + "loss": 0.0583, + "step": 990 + }, + { + "epoch": 0.028591851322373123, + "grad_norm": 0.8101517558097839, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0584, + "step": 1000 + }, + { + "epoch": 0.028877769835596853, + "grad_norm": 1.060592770576477, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.077, + "step": 1010 + }, + { + "epoch": 0.029163688348820587, + "grad_norm": 1.2096195220947266, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.066, + "step": 1020 + }, + { + "epoch": 0.029449606862044318, + "grad_norm": 1.0035862922668457, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0625, + "step": 1030 + }, + { + "epoch": 0.029735525375268048, + "grad_norm": 0.44185084104537964, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.03002144388849178, + "grad_norm": 1.209908127784729, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0693, + "step": 1050 + }, + { + "epoch": 0.030307362401715512, + "grad_norm": 0.9716938138008118, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0706, + "step": 1060 + }, + { + "epoch": 0.030593280914939243, + "grad_norm": 0.8310994505882263, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0622, + "step": 1070 + }, + { + "epoch": 0.030879199428162973, + "grad_norm": 0.8737888932228088, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0564, + "step": 1080 + }, + { + "epoch": 0.031165117941386704, + "grad_norm": 0.7609763145446777, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0552, + "step": 1090 + }, + { + "epoch": 0.031451036454610434, + "grad_norm": 0.6319764256477356, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0593, + "step": 1100 + }, + { + "epoch": 0.031736954967834165, + "grad_norm": 0.5562251806259155, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0553, + "step": 1110 + }, + { + "epoch": 0.032022873481057895, + "grad_norm": 1.3476046323776245, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0805, + "step": 1120 + }, + { + "epoch": 0.03230879199428163, + "grad_norm": 0.5449394583702087, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0666, + "step": 1130 + }, + { + "epoch": 0.03259471050750536, + "grad_norm": 0.8675817251205444, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0703, + "step": 1140 + }, + { + "epoch": 0.032880629020729094, + "grad_norm": 0.8713150024414062, + "learning_rate": 1.999882759038658e-05, + "loss": 0.063, + "step": 1150 + }, + { + "epoch": 0.033166547533952824, + "grad_norm": 0.7205761075019836, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0597, + "step": 1160 + }, + { + "epoch": 0.033452466047176554, + "grad_norm": 0.482741117477417, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0558, + "step": 1170 + }, + { + "epoch": 0.033738384560400285, + "grad_norm": 0.8652167320251465, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0559, + "step": 1180 + }, + { + "epoch": 0.034024303073624015, + "grad_norm": 0.5286755561828613, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0673, + "step": 1190 + }, + { + "epoch": 0.034310221586847746, + "grad_norm": 0.9883217215538025, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0609, + "step": 1200 + }, + { + "epoch": 0.034596140100071476, + "grad_norm": 0.7700253129005432, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0703, + "step": 1210 + }, + { + "epoch": 0.034882058613295214, + "grad_norm": 0.8669867515563965, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0577, + "step": 1220 + }, + { + "epoch": 0.035167977126518944, + "grad_norm": 0.8856104016304016, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0599, + "step": 1230 + }, + { + "epoch": 0.035453895639742675, + "grad_norm": 0.5517004728317261, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0791, + "step": 1240 + }, + { + "epoch": 0.035739814152966405, + "grad_norm": 0.7505853176116943, + "learning_rate": 1.999672592499692e-05, + "loss": 0.086, + "step": 1250 + }, + { + "epoch": 0.036025732666190136, + "grad_norm": 0.7412230968475342, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0534, + "step": 1260 + }, + { + "epoch": 0.036311651179413866, + "grad_norm": 0.6629419922828674, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0607, + "step": 1270 + }, + { + "epoch": 0.036597569692637597, + "grad_norm": 0.7081887125968933, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0513, + "step": 1280 + }, + { + "epoch": 0.03688348820586133, + "grad_norm": 0.8555129766464233, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0574, + "step": 1290 + }, + { + "epoch": 0.03716940671908506, + "grad_norm": 0.5992563366889954, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0529, + "step": 1300 + }, + { + "epoch": 0.037455325232308795, + "grad_norm": 0.8527185320854187, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0588, + "step": 1310 + }, + { + "epoch": 0.037741243745532525, + "grad_norm": 1.078600525856018, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0644, + "step": 1320 + }, + { + "epoch": 0.038027162258756256, + "grad_norm": 0.8158502578735352, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0587, + "step": 1330 + }, + { + "epoch": 0.038313080771979986, + "grad_norm": 1.011278748512268, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0696, + "step": 1340 + }, + { + "epoch": 0.03859899928520372, + "grad_norm": 0.806888222694397, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0588, + "step": 1350 + }, + { + "epoch": 0.03888491779842745, + "grad_norm": 0.7776031494140625, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0461, + "step": 1360 + }, + { + "epoch": 0.03917083631165118, + "grad_norm": 0.6119349598884583, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0566, + "step": 1370 + }, + { + "epoch": 0.03945675482487491, + "grad_norm": 0.6168059706687927, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0636, + "step": 1380 + }, + { + "epoch": 0.03974267333809864, + "grad_norm": 0.8180692195892334, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0596, + "step": 1390 + }, + { + "epoch": 0.040028591851322376, + "grad_norm": 0.6775726079940796, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0586, + "step": 1400 + }, + { + "epoch": 0.040314510364546106, + "grad_norm": 0.7446377873420715, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.057, + "step": 1410 + }, + { + "epoch": 0.04060042887776984, + "grad_norm": 0.9334514737129211, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0551, + "step": 1420 + }, + { + "epoch": 0.04088634739099357, + "grad_norm": 1.481874942779541, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0664, + "step": 1430 + }, + { + "epoch": 0.0411722659042173, + "grad_norm": 0.9553850889205933, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0645, + "step": 1440 + }, + { + "epoch": 0.04145818441744103, + "grad_norm": 0.8824119567871094, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0694, + "step": 1450 + }, + { + "epoch": 0.04174410293066476, + "grad_norm": 1.0382661819458008, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0683, + "step": 1460 + }, + { + "epoch": 0.04203002144388849, + "grad_norm": 0.5914127826690674, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0652, + "step": 1470 + }, + { + "epoch": 0.04231593995711222, + "grad_norm": 0.8497964143753052, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0706, + "step": 1480 + }, + { + "epoch": 0.04260185847033596, + "grad_norm": 0.897759199142456, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0647, + "step": 1490 + }, + { + "epoch": 0.04288777698355969, + "grad_norm": 1.1102443933486938, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0579, + "step": 1500 + }, + { + "epoch": 0.04317369549678342, + "grad_norm": 0.7638678550720215, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0606, + "step": 1510 + }, + { + "epoch": 0.04345961401000715, + "grad_norm": 0.6662708520889282, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.067, + "step": 1520 + }, + { + "epoch": 0.04374553252323088, + "grad_norm": 0.4957924485206604, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0685, + "step": 1530 + }, + { + "epoch": 0.04403145103645461, + "grad_norm": 0.6456794738769531, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0665, + "step": 1540 + }, + { + "epoch": 0.04431736954967834, + "grad_norm": 1.1598498821258545, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0527, + "step": 1550 + }, + { + "epoch": 0.04460328806290207, + "grad_norm": 0.931520938873291, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0682, + "step": 1560 + }, + { + "epoch": 0.0448892065761258, + "grad_norm": 0.7289925813674927, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0726, + "step": 1570 + }, + { + "epoch": 0.04517512508934954, + "grad_norm": 0.5471235513687134, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0561, + "step": 1580 + }, + { + "epoch": 0.04546104360257327, + "grad_norm": 0.8686550259590149, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0552, + "step": 1590 + }, + { + "epoch": 0.045746962115797, + "grad_norm": 1.1767120361328125, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0544, + "step": 1600 + }, + { + "epoch": 0.04603288062902073, + "grad_norm": 0.8729729056358337, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0625, + "step": 1610 + }, + { + "epoch": 0.04631879914224446, + "grad_norm": 1.3734601736068726, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0667, + "step": 1620 + }, + { + "epoch": 0.04660471765546819, + "grad_norm": 0.6810682415962219, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0522, + "step": 1630 + }, + { + "epoch": 0.04689063616869192, + "grad_norm": 0.7744873762130737, + "learning_rate": 1.997844517262844e-05, + "loss": 0.06, + "step": 1640 + }, + { + "epoch": 0.04717655468191565, + "grad_norm": 1.000954270362854, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0606, + "step": 1650 + }, + { + "epoch": 0.04746247319513938, + "grad_norm": 0.8105701208114624, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.04774839170836312, + "grad_norm": 0.9504240155220032, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0833, + "step": 1670 + }, + { + "epoch": 0.04803431022158685, + "grad_norm": 0.910836935043335, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0582, + "step": 1680 + }, + { + "epoch": 0.04832022873481058, + "grad_norm": 0.5865645408630371, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0657, + "step": 1690 + }, + { + "epoch": 0.04860614724803431, + "grad_norm": 1.0098698139190674, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 0.04889206576125804, + "grad_norm": 0.8097764253616333, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0563, + "step": 1710 + }, + { + "epoch": 0.04917798427448177, + "grad_norm": 0.9958128333091736, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0597, + "step": 1720 + }, + { + "epoch": 0.0494639027877055, + "grad_norm": 0.8471905589103699, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.04974982130092923, + "grad_norm": 0.647058367729187, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 0.05003573981415296, + "grad_norm": 1.0832161903381348, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.052, + "step": 1750 + }, + { + "epoch": 0.0503216583273767, + "grad_norm": 0.8469381332397461, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0615, + "step": 1760 + }, + { + "epoch": 0.05060757684060043, + "grad_norm": 0.5371052622795105, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0556, + "step": 1770 + }, + { + "epoch": 0.05089349535382416, + "grad_norm": 0.9016183614730835, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0561, + "step": 1780 + }, + { + "epoch": 0.05117941386704789, + "grad_norm": 0.8829526305198669, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0648, + "step": 1790 + }, + { + "epoch": 0.05146533238027162, + "grad_norm": 1.079738974571228, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0577, + "step": 1800 + }, + { + "epoch": 0.05175125089349535, + "grad_norm": 0.7496556639671326, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.052, + "step": 1810 + }, + { + "epoch": 0.05203716940671908, + "grad_norm": 0.7587016820907593, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0634, + "step": 1820 + }, + { + "epoch": 0.052323087919942814, + "grad_norm": 0.9622246623039246, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0543, + "step": 1830 + }, + { + "epoch": 0.052609006433166544, + "grad_norm": 0.6643623113632202, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0606, + "step": 1840 + }, + { + "epoch": 0.05289492494639028, + "grad_norm": 0.8060843348503113, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0562, + "step": 1850 + }, + { + "epoch": 0.05318084345961401, + "grad_norm": 0.7353034019470215, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0679, + "step": 1860 + }, + { + "epoch": 0.05346676197283774, + "grad_norm": 0.6636782288551331, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0561, + "step": 1870 + }, + { + "epoch": 0.05375268048606147, + "grad_norm": 0.6760010719299316, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0674, + "step": 1880 + }, + { + "epoch": 0.0540385989992852, + "grad_norm": 0.7144591808319092, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0551, + "step": 1890 + }, + { + "epoch": 0.054324517512508934, + "grad_norm": 0.8346575498580933, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.049, + "step": 1900 + }, + { + "epoch": 0.054610436025732664, + "grad_norm": 1.1682871580123901, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0554, + "step": 1910 + }, + { + "epoch": 0.054896354538956395, + "grad_norm": 0.9150840640068054, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0549, + "step": 1920 + }, + { + "epoch": 0.055182273052180125, + "grad_norm": 0.37064746022224426, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0547, + "step": 1930 + }, + { + "epoch": 0.05546819156540386, + "grad_norm": 1.1214783191680908, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0697, + "step": 1940 + }, + { + "epoch": 0.05575411007862759, + "grad_norm": 0.8259853720664978, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0614, + "step": 1950 + }, + { + "epoch": 0.056040028591851324, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0811, + "step": 1960 + }, + { + "epoch": 0.056325947105075054, + "grad_norm": 0.8764797449111938, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0605, + "step": 1970 + }, + { + "epoch": 0.056611865618298784, + "grad_norm": 0.770044207572937, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0481, + "step": 1980 + }, + { + "epoch": 0.056897784131522515, + "grad_norm": 1.333876132965088, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0729, + "step": 1990 + }, + { + "epoch": 0.057183702644746245, + "grad_norm": 0.5231258273124695, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.051, + "step": 2000 + }, + { + "epoch": 0.057469621157969976, + "grad_norm": 1.1937541961669922, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.061, + "step": 2010 + }, + { + "epoch": 0.057755539671193706, + "grad_norm": 0.7843487858772278, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0688, + "step": 2020 + }, + { + "epoch": 0.058041458184417444, + "grad_norm": 0.7956593632698059, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0878, + "step": 2030 + }, + { + "epoch": 0.058327376697641174, + "grad_norm": 0.5006444454193115, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0588, + "step": 2040 + }, + { + "epoch": 0.058613295210864905, + "grad_norm": 1.162245750427246, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0619, + "step": 2050 + }, + { + "epoch": 0.058899213724088635, + "grad_norm": 0.46943384408950806, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0584, + "step": 2060 + }, + { + "epoch": 0.059185132237312366, + "grad_norm": 0.3780323266983032, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0462, + "step": 2070 + }, + { + "epoch": 0.059471050750536096, + "grad_norm": 0.7066171765327454, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0652, + "step": 2080 + }, + { + "epoch": 0.05975696926375983, + "grad_norm": 0.8464685082435608, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0546, + "step": 2090 + }, + { + "epoch": 0.06004288777698356, + "grad_norm": 0.7198944687843323, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0534, + "step": 2100 + }, + { + "epoch": 0.06032880629020729, + "grad_norm": 0.7136557698249817, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0665, + "step": 2110 + }, + { + "epoch": 0.060614724803431025, + "grad_norm": 0.8739225268363953, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0542, + "step": 2120 + }, + { + "epoch": 0.060900643316654755, + "grad_norm": 0.6694063544273376, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0575, + "step": 2130 + }, + { + "epoch": 0.061186561829878486, + "grad_norm": 0.4805296063423157, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0578, + "step": 2140 + }, + { + "epoch": 0.061472480343102216, + "grad_norm": 0.758660078048706, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0455, + "step": 2150 + }, + { + "epoch": 0.06175839885632595, + "grad_norm": 0.8114968538284302, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0801, + "step": 2160 + }, + { + "epoch": 0.06204431736954968, + "grad_norm": 0.6585670113563538, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0564, + "step": 2170 + }, + { + "epoch": 0.06233023588277341, + "grad_norm": 1.2986794710159302, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0595, + "step": 2180 + }, + { + "epoch": 0.06261615439599715, + "grad_norm": 0.9822471141815186, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0515, + "step": 2190 + }, + { + "epoch": 0.06290207290922087, + "grad_norm": 0.8112025260925293, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0585, + "step": 2200 + }, + { + "epoch": 0.0631879914224446, + "grad_norm": 0.6239551305770874, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0641, + "step": 2210 + }, + { + "epoch": 0.06347390993566833, + "grad_norm": 0.8405657410621643, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.057, + "step": 2220 + }, + { + "epoch": 0.06375982844889207, + "grad_norm": 0.4925670623779297, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0568, + "step": 2230 + }, + { + "epoch": 0.06404574696211579, + "grad_norm": 0.8599978089332581, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0587, + "step": 2240 + }, + { + "epoch": 0.06433166547533953, + "grad_norm": 0.8657258749008179, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.06461758398856327, + "grad_norm": 0.5826218128204346, + "learning_rate": 1.991642153373178e-05, + "loss": 0.055, + "step": 2260 + }, + { + "epoch": 0.06490350250178699, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0533, + "step": 2270 + }, + { + "epoch": 0.06518942101501073, + "grad_norm": 0.8345134258270264, + "learning_rate": 1.991374933341515e-05, + "loss": 0.064, + "step": 2280 + }, + { + "epoch": 0.06547533952823445, + "grad_norm": 0.6610177755355835, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0553, + "step": 2290 + }, + { + "epoch": 0.06576125804145819, + "grad_norm": 0.8541404604911804, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0483, + "step": 2300 + }, + { + "epoch": 0.06604717655468191, + "grad_norm": 0.9029123187065125, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0517, + "step": 2310 + }, + { + "epoch": 0.06633309506790565, + "grad_norm": 0.614111602306366, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.06661901358112937, + "grad_norm": 0.8723806142807007, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0511, + "step": 2330 + }, + { + "epoch": 0.06690493209435311, + "grad_norm": 0.5288586020469666, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0474, + "step": 2340 + }, + { + "epoch": 0.06719085060757685, + "grad_norm": 0.6346511840820312, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0532, + "step": 2350 + }, + { + "epoch": 0.06747676912080057, + "grad_norm": 0.9112687706947327, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0583, + "step": 2360 + }, + { + "epoch": 0.06776268763402431, + "grad_norm": 0.6879385113716125, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0551, + "step": 2370 + }, + { + "epoch": 0.06804860614724803, + "grad_norm": 0.6945562958717346, + "learning_rate": 1.989976094288735e-05, + "loss": 0.053, + "step": 2380 + }, + { + "epoch": 0.06833452466047177, + "grad_norm": 0.6774301528930664, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0596, + "step": 2390 + }, + { + "epoch": 0.06862044317369549, + "grad_norm": 0.7311446070671082, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0576, + "step": 2400 + }, + { + "epoch": 0.06890636168691923, + "grad_norm": 0.9301936030387878, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0608, + "step": 2410 + }, + { + "epoch": 0.06919228020014295, + "grad_norm": 1.1750341653823853, + "learning_rate": 1.989387305123247e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.06947819871336669, + "grad_norm": 0.716266930103302, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.053, + "step": 2430 + }, + { + "epoch": 0.06976411722659043, + "grad_norm": 0.8549973964691162, + "learning_rate": 1.989086647373215e-05, + "loss": 0.061, + "step": 2440 + }, + { + "epoch": 0.07005003573981415, + "grad_norm": 0.7306638360023499, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "epoch": 0.07033595425303789, + "grad_norm": 1.2529624700546265, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0597, + "step": 2460 + }, + { + "epoch": 0.07062187276626161, + "grad_norm": 0.7199717164039612, + "learning_rate": 1.988627835751598e-05, + "loss": 0.047, + "step": 2470 + }, + { + "epoch": 0.07090779127948535, + "grad_norm": 0.8007253408432007, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0544, + "step": 2480 + }, + { + "epoch": 0.07119370979270907, + "grad_norm": 0.7852535843849182, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0507, + "step": 2490 + }, + { + "epoch": 0.07147962830593281, + "grad_norm": 1.0649739503860474, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.058, + "step": 2500 + }, + { + "epoch": 0.07176554681915653, + "grad_norm": 0.8080071806907654, + "learning_rate": 1.988001487826387e-05, + "loss": 0.059, + "step": 2510 + }, + { + "epoch": 0.07205146533238027, + "grad_norm": 0.49453601241111755, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0522, + "step": 2520 + }, + { + "epoch": 0.07233738384560401, + "grad_norm": 0.7618975639343262, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.07262330235882773, + "grad_norm": 0.6284596920013428, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.058, + "step": 2540 + }, + { + "epoch": 0.07290922087205147, + "grad_norm": 1.6536812782287598, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0535, + "step": 2550 + }, + { + "epoch": 0.07319513938527519, + "grad_norm": 0.6516987681388855, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.061, + "step": 2560 + }, + { + "epoch": 0.07348105789849893, + "grad_norm": 0.7660441398620605, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0603, + "step": 2570 + }, + { + "epoch": 0.07376697641172265, + "grad_norm": 0.7900884747505188, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0494, + "step": 2580 + }, + { + "epoch": 0.07405289492494639, + "grad_norm": 0.9578459858894348, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0492, + "step": 2590 + }, + { + "epoch": 0.07433881343817011, + "grad_norm": 0.5268751978874207, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0524, + "step": 2600 + }, + { + "epoch": 0.07462473195139385, + "grad_norm": 0.8935990929603577, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0537, + "step": 2610 + }, + { + "epoch": 0.07491065046461759, + "grad_norm": 0.940441370010376, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0545, + "step": 2620 + }, + { + "epoch": 0.07519656897784131, + "grad_norm": 0.42767468094825745, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0508, + "step": 2630 + }, + { + "epoch": 0.07548248749106505, + "grad_norm": 0.6892207860946655, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0417, + "step": 2640 + }, + { + "epoch": 0.07576840600428877, + "grad_norm": 1.2622859477996826, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0665, + "step": 2650 + }, + { + "epoch": 0.07605432451751251, + "grad_norm": 0.8809115290641785, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0617, + "step": 2660 + }, + { + "epoch": 0.07634024303073624, + "grad_norm": 0.604371190071106, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0577, + "step": 2670 + }, + { + "epoch": 0.07662616154395997, + "grad_norm": 0.7091525793075562, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0552, + "step": 2680 + }, + { + "epoch": 0.0769120800571837, + "grad_norm": 0.7841326594352722, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0579, + "step": 2690 + }, + { + "epoch": 0.07719799857040743, + "grad_norm": 0.7789046764373779, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0511, + "step": 2700 + }, + { + "epoch": 0.07748391708363117, + "grad_norm": 0.6497660875320435, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0532, + "step": 2710 + }, + { + "epoch": 0.0777698355968549, + "grad_norm": 0.6902356147766113, + "learning_rate": 1.984439891859038e-05, + "loss": 0.06, + "step": 2720 + }, + { + "epoch": 0.07805575411007863, + "grad_norm": 0.5721703767776489, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0442, + "step": 2730 + }, + { + "epoch": 0.07834167262330236, + "grad_norm": 0.5205336809158325, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0551, + "step": 2740 + }, + { + "epoch": 0.07862759113652609, + "grad_norm": 1.0646073818206787, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0546, + "step": 2750 + }, + { + "epoch": 0.07891350964974982, + "grad_norm": 0.6809906363487244, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0603, + "step": 2760 + }, + { + "epoch": 0.07919942816297355, + "grad_norm": 0.7592756152153015, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0611, + "step": 2770 + }, + { + "epoch": 0.07948534667619728, + "grad_norm": 0.970733106136322, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.066, + "step": 2780 + }, + { + "epoch": 0.07977126518942101, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.983150881656814e-05, + "loss": 0.049, + "step": 2790 + }, + { + "epoch": 0.08005718370264475, + "grad_norm": 0.6761397123336792, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.048, + "step": 2800 + }, + { + "epoch": 0.08034310221586848, + "grad_norm": 0.9752228856086731, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.08062902072909221, + "grad_norm": 0.8727124929428101, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0629, + "step": 2820 + }, + { + "epoch": 0.08091493924231594, + "grad_norm": 0.8425240516662598, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0517, + "step": 2830 + }, + { + "epoch": 0.08120085775553967, + "grad_norm": 0.7011470198631287, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0484, + "step": 2840 + }, + { + "epoch": 0.0814867762687634, + "grad_norm": 0.836200475692749, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0504, + "step": 2850 + }, + { + "epoch": 0.08177269478198713, + "grad_norm": 0.4431964159011841, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0527, + "step": 2860 + }, + { + "epoch": 0.08205861329521086, + "grad_norm": 0.4666791260242462, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0556, + "step": 2870 + }, + { + "epoch": 0.0823445318084346, + "grad_norm": 0.5705346465110779, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0544, + "step": 2880 + }, + { + "epoch": 0.08263045032165833, + "grad_norm": 1.7237486839294434, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0555, + "step": 2890 + }, + { + "epoch": 0.08291636883488206, + "grad_norm": 0.9305147528648376, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.055, + "step": 2900 + }, + { + "epoch": 0.0832022873481058, + "grad_norm": 1.3475992679595947, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0723, + "step": 2910 + }, + { + "epoch": 0.08348820586132952, + "grad_norm": 0.7196787595748901, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0581, + "step": 2920 + }, + { + "epoch": 0.08377412437455325, + "grad_norm": 0.4567016363143921, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.08406004288777698, + "grad_norm": 0.8537796139717102, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0589, + "step": 2940 + }, + { + "epoch": 0.08434596140100072, + "grad_norm": 0.9526864886283875, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0581, + "step": 2950 + }, + { + "epoch": 0.08463187991422444, + "grad_norm": 0.8753517866134644, + "learning_rate": 1.979809151602651e-05, + "loss": 0.066, + "step": 2960 + }, + { + "epoch": 0.08491779842744818, + "grad_norm": 0.9062561988830566, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0472, + "step": 2970 + }, + { + "epoch": 0.08520371694067191, + "grad_norm": 1.0018329620361328, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0584, + "step": 2980 + }, + { + "epoch": 0.08548963545389564, + "grad_norm": 1.0577157735824585, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.05, + "step": 2990 + }, + { + "epoch": 0.08577555396711938, + "grad_norm": 1.0216799974441528, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0703, + "step": 3000 + }, + { + "epoch": 0.0860614724803431, + "grad_norm": 0.5581191778182983, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0682, + "step": 3010 + }, + { + "epoch": 0.08634739099356684, + "grad_norm": 0.6187682151794434, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 0.08663330950679056, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0565, + "step": 3030 + }, + { + "epoch": 0.0869192280200143, + "grad_norm": 0.8952509760856628, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0615, + "step": 3040 + }, + { + "epoch": 0.08720514653323802, + "grad_norm": 0.7387855648994446, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0434, + "step": 3050 + }, + { + "epoch": 0.08749106504646176, + "grad_norm": 0.8661363124847412, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0453, + "step": 3060 + }, + { + "epoch": 0.0877769835596855, + "grad_norm": 1.552089810371399, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0577, + "step": 3070 + }, + { + "epoch": 0.08806290207290922, + "grad_norm": 0.7555598616600037, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.058, + "step": 3080 + }, + { + "epoch": 0.08834882058613296, + "grad_norm": 0.7763100266456604, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.074, + "step": 3090 + }, + { + "epoch": 0.08863473909935668, + "grad_norm": 0.5088932514190674, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.07, + "step": 3100 + }, + { + "epoch": 0.08892065761258042, + "grad_norm": 0.517383873462677, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.08920657612580414, + "grad_norm": 0.9673930406570435, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.061, + "step": 3120 + }, + { + "epoch": 0.08949249463902788, + "grad_norm": 1.1182832717895508, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0503, + "step": 3130 + }, + { + "epoch": 0.0897784131522516, + "grad_norm": 0.8064592480659485, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0527, + "step": 3140 + }, + { + "epoch": 0.09006433166547534, + "grad_norm": 1.3616310358047485, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0491, + "step": 3150 + }, + { + "epoch": 0.09035025017869908, + "grad_norm": 0.6205968856811523, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0492, + "step": 3160 + }, + { + "epoch": 0.0906361686919228, + "grad_norm": 0.9427729249000549, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.054, + "step": 3170 + }, + { + "epoch": 0.09092208720514654, + "grad_norm": 0.6940050721168518, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0622, + "step": 3180 + }, + { + "epoch": 0.09120800571837026, + "grad_norm": 0.7082361578941345, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0474, + "step": 3190 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.4606474041938782, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 0.09177984274481772, + "grad_norm": 0.46445760130882263, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0483, + "step": 3210 + }, + { + "epoch": 0.09206576125804146, + "grad_norm": 0.7431371212005615, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.046, + "step": 3220 + }, + { + "epoch": 0.09235167977126518, + "grad_norm": 0.8430010676383972, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0665, + "step": 3230 + }, + { + "epoch": 0.09263759828448892, + "grad_norm": 0.9888875484466553, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0676, + "step": 3240 + }, + { + "epoch": 0.09292351679771266, + "grad_norm": 0.792150616645813, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0636, + "step": 3250 + }, + { + "epoch": 0.09320943531093638, + "grad_norm": 0.859030544757843, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0634, + "step": 3260 + }, + { + "epoch": 0.09349535382416012, + "grad_norm": 0.7612795233726501, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0658, + "step": 3270 + }, + { + "epoch": 0.09378127233738384, + "grad_norm": 0.5470104217529297, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0514, + "step": 3280 + }, + { + "epoch": 0.09406719085060758, + "grad_norm": 0.6354894042015076, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0489, + "step": 3290 + }, + { + "epoch": 0.0943531093638313, + "grad_norm": 1.3852356672286987, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 0.09463902787705504, + "grad_norm": 0.5610274076461792, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.09492494639027876, + "grad_norm": 1.2192410230636597, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0525, + "step": 3320 + }, + { + "epoch": 0.0952108649035025, + "grad_norm": 1.06831955909729, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.051, + "step": 3330 + }, + { + "epoch": 0.09549678341672624, + "grad_norm": 0.32288479804992676, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0503, + "step": 3340 + }, + { + "epoch": 0.09578270192994996, + "grad_norm": 0.5871645212173462, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0564, + "step": 3350 + }, + { + "epoch": 0.0960686204431737, + "grad_norm": 0.6069591045379639, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0495, + "step": 3360 + }, + { + "epoch": 0.09635453895639742, + "grad_norm": 1.0015379190444946, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0624, + "step": 3370 + }, + { + "epoch": 0.09664045746962116, + "grad_norm": 0.7534980773925781, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0618, + "step": 3380 + }, + { + "epoch": 0.09692637598284488, + "grad_norm": 0.45888280868530273, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0445, + "step": 3390 + }, + { + "epoch": 0.09721229449606862, + "grad_norm": 0.7550806403160095, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0461, + "step": 3400 + }, + { + "epoch": 0.09749821300929234, + "grad_norm": 0.4738181531429291, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 0.09778413152251608, + "grad_norm": 0.6711190938949585, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0476, + "step": 3420 + }, + { + "epoch": 0.09807005003573982, + "grad_norm": 0.4751316010951996, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0507, + "step": 3430 + }, + { + "epoch": 0.09835596854896354, + "grad_norm": 0.83565753698349, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "epoch": 0.09864188706218728, + "grad_norm": 0.5360665321350098, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0627, + "step": 3450 + }, + { + "epoch": 0.098927805575411, + "grad_norm": 0.7463604211807251, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0496, + "step": 3460 + }, + { + "epoch": 0.09921372408863474, + "grad_norm": 0.7294344305992126, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0697, + "step": 3470 + }, + { + "epoch": 0.09949964260185847, + "grad_norm": 0.5676283836364746, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0541, + "step": 3480 + }, + { + "epoch": 0.0997855611150822, + "grad_norm": 0.5879732370376587, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 0.10007147962830593, + "grad_norm": 0.832818865776062, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0505, + "step": 3500 + }, + { + "epoch": 0.10035739814152966, + "grad_norm": 0.48553410172462463, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0573, + "step": 3510 + }, + { + "epoch": 0.1006433166547534, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0449, + "step": 3520 + }, + { + "epoch": 0.10092923516797712, + "grad_norm": 0.7497885227203369, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0737, + "step": 3530 + }, + { + "epoch": 0.10121515368120086, + "grad_norm": 0.5581928491592407, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0514, + "step": 3540 + }, + { + "epoch": 0.10150107219442459, + "grad_norm": 1.140236258506775, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0524, + "step": 3550 + }, + { + "epoch": 0.10178699070764832, + "grad_norm": 0.8161870241165161, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0721, + "step": 3560 + }, + { + "epoch": 0.10207290922087205, + "grad_norm": 0.8796533942222595, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0564, + "step": 3570 + }, + { + "epoch": 0.10235882773409578, + "grad_norm": 1.4811128377914429, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.10264474624731951, + "grad_norm": 0.8029062747955322, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0589, + "step": 3590 + }, + { + "epoch": 0.10293066476054324, + "grad_norm": 0.7806634902954102, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0617, + "step": 3600 + }, + { + "epoch": 0.10321658327376698, + "grad_norm": 1.1286838054656982, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0574, + "step": 3610 + }, + { + "epoch": 0.1035025017869907, + "grad_norm": 0.374104768037796, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.047, + "step": 3620 + }, + { + "epoch": 0.10378842030021444, + "grad_norm": 1.1743136644363403, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0676, + "step": 3630 + }, + { + "epoch": 0.10407433881343817, + "grad_norm": 0.7684413194656372, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0663, + "step": 3640 + }, + { + "epoch": 0.1043602573266619, + "grad_norm": 1.0642409324645996, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.051, + "step": 3650 + }, + { + "epoch": 0.10464617583988563, + "grad_norm": 0.7752460837364197, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0543, + "step": 3660 + }, + { + "epoch": 0.10493209435310936, + "grad_norm": 0.9053257703781128, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.051, + "step": 3670 + }, + { + "epoch": 0.10521801286633309, + "grad_norm": 0.7407983541488647, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 0.10550393137955683, + "grad_norm": 1.3622519969940186, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0468, + "step": 3690 + }, + { + "epoch": 0.10578984989278056, + "grad_norm": 1.2751463651657104, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0571, + "step": 3700 + }, + { + "epoch": 0.10607576840600429, + "grad_norm": 0.5535411238670349, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0564, + "step": 3710 + }, + { + "epoch": 0.10636168691922802, + "grad_norm": 0.6728671193122864, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0487, + "step": 3720 + }, + { + "epoch": 0.10664760543245175, + "grad_norm": 0.82345050573349, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0656, + "step": 3730 + }, + { + "epoch": 0.10693352394567548, + "grad_norm": 0.6446594595909119, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0467, + "step": 3740 + }, + { + "epoch": 0.10721944245889921, + "grad_norm": 1.0836280584335327, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0536, + "step": 3750 + }, + { + "epoch": 0.10750536097212295, + "grad_norm": 0.3758300840854645, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0476, + "step": 3760 + }, + { + "epoch": 0.10779127948534667, + "grad_norm": 0.682266116142273, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0519, + "step": 3770 + }, + { + "epoch": 0.1080771979985704, + "grad_norm": 0.5025804042816162, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0474, + "step": 3780 + }, + { + "epoch": 0.10836311651179414, + "grad_norm": 1.019890308380127, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0492, + "step": 3790 + }, + { + "epoch": 0.10864903502501787, + "grad_norm": 0.7843710780143738, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0561, + "step": 3800 + }, + { + "epoch": 0.1089349535382416, + "grad_norm": 0.5028522610664368, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0582, + "step": 3810 + }, + { + "epoch": 0.10922087205146533, + "grad_norm": 0.6400144696235657, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0502, + "step": 3820 + }, + { + "epoch": 0.10950679056468907, + "grad_norm": 0.9432899355888367, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0471, + "step": 3830 + }, + { + "epoch": 0.10979270907791279, + "grad_norm": 0.7582482695579529, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.052, + "step": 3840 + }, + { + "epoch": 0.11007862759113653, + "grad_norm": 0.34035608172416687, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0464, + "step": 3850 + }, + { + "epoch": 0.11036454610436025, + "grad_norm": 1.3330878019332886, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.11065046461758399, + "grad_norm": 0.7309219837188721, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.052, + "step": 3870 + }, + { + "epoch": 0.11093638313080773, + "grad_norm": 0.6248922944068909, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0815, + "step": 3880 + }, + { + "epoch": 0.11122230164403145, + "grad_norm": 0.8298835158348083, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0491, + "step": 3890 + }, + { + "epoch": 0.11150822015725519, + "grad_norm": 0.6728928685188293, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0506, + "step": 3900 + }, + { + "epoch": 0.11179413867047891, + "grad_norm": 0.8456764817237854, + "learning_rate": 1.95567930185928e-05, + "loss": 0.051, + "step": 3910 + }, + { + "epoch": 0.11208005718370265, + "grad_norm": 0.9024212956428528, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0514, + "step": 3920 + }, + { + "epoch": 0.11236597569692637, + "grad_norm": 0.4843275845050812, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.056, + "step": 3930 + }, + { + "epoch": 0.11265189421015011, + "grad_norm": 0.5677530765533447, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0548, + "step": 3940 + }, + { + "epoch": 0.11293781272337383, + "grad_norm": 1.0913296937942505, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0697, + "step": 3950 + }, + { + "epoch": 0.11322373123659757, + "grad_norm": 0.6271129250526428, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0655, + "step": 3960 + }, + { + "epoch": 0.1135096497498213, + "grad_norm": 0.9063813090324402, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0469, + "step": 3970 + }, + { + "epoch": 0.11379556826304503, + "grad_norm": 0.7493836283683777, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0482, + "step": 3980 + }, + { + "epoch": 0.11408148677626877, + "grad_norm": 0.8022870421409607, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0655, + "step": 3990 + }, + { + "epoch": 0.11436740528949249, + "grad_norm": 0.6266750693321228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0542, + "step": 4000 + }, + { + "epoch": 0.11465332380271623, + "grad_norm": 0.45027732849121094, + "learning_rate": 1.95260726824789e-05, + "loss": 0.058, + "step": 4010 + }, + { + "epoch": 0.11493924231593995, + "grad_norm": 0.950760543346405, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0552, + "step": 4020 + }, + { + "epoch": 0.11522516082916369, + "grad_norm": 0.6397078037261963, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0544, + "step": 4030 + }, + { + "epoch": 0.11551107934238741, + "grad_norm": 0.7060579657554626, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0565, + "step": 4040 + }, + { + "epoch": 0.11579699785561115, + "grad_norm": 0.7861781716346741, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0511, + "step": 4050 + }, + { + "epoch": 0.11608291636883489, + "grad_norm": 0.5479229688644409, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0509, + "step": 4060 + }, + { + "epoch": 0.11636883488205861, + "grad_norm": 0.3854960501194, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0623, + "step": 4070 + }, + { + "epoch": 0.11665475339528235, + "grad_norm": 1.9533435106277466, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0643, + "step": 4080 + }, + { + "epoch": 0.11694067190850607, + "grad_norm": 0.5853668451309204, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0453, + "step": 4090 + }, + { + "epoch": 0.11722659042172981, + "grad_norm": 0.6850668787956238, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0555, + "step": 4100 + }, + { + "epoch": 0.11751250893495353, + "grad_norm": 1.1605839729309082, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0586, + "step": 4110 + }, + { + "epoch": 0.11779842744817727, + "grad_norm": 0.7753151059150696, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0623, + "step": 4120 + }, + { + "epoch": 0.118084345961401, + "grad_norm": 0.7955726385116577, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0467, + "step": 4130 + }, + { + "epoch": 0.11837026447462473, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0523, + "step": 4140 + }, + { + "epoch": 0.11865618298784847, + "grad_norm": 0.5821241140365601, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 0.11894210150107219, + "grad_norm": 0.4795539379119873, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0559, + "step": 4160 + }, + { + "epoch": 0.11922802001429593, + "grad_norm": 0.6324377655982971, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0473, + "step": 4170 + }, + { + "epoch": 0.11951393852751965, + "grad_norm": 0.8578745722770691, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0474, + "step": 4180 + }, + { + "epoch": 0.11979985704074339, + "grad_norm": 0.5988736748695374, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 0.12008577555396711, + "grad_norm": 0.8098701238632202, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0511, + "step": 4200 + }, + { + "epoch": 0.12037169406719085, + "grad_norm": 1.2059956789016724, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0501, + "step": 4210 + }, + { + "epoch": 0.12065761258041457, + "grad_norm": 0.7477571368217468, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0565, + "step": 4220 + }, + { + "epoch": 0.12094353109363831, + "grad_norm": 0.467942476272583, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0601, + "step": 4230 + }, + { + "epoch": 0.12122944960686205, + "grad_norm": 0.5761682391166687, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.052, + "step": 4240 + }, + { + "epoch": 0.12151536812008577, + "grad_norm": 0.8247032761573792, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0503, + "step": 4250 + }, + { + "epoch": 0.12180128663330951, + "grad_norm": 0.5218040347099304, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0434, + "step": 4260 + }, + { + "epoch": 0.12208720514653323, + "grad_norm": 0.5024936199188232, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 0.12237312365975697, + "grad_norm": 0.5558021664619446, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0493, + "step": 4280 + }, + { + "epoch": 0.1226590421729807, + "grad_norm": 0.6252139210700989, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0467, + "step": 4290 + }, + { + "epoch": 0.12294496068620443, + "grad_norm": 0.6613588929176331, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0572, + "step": 4300 + }, + { + "epoch": 0.12323087919942816, + "grad_norm": 0.8098927736282349, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0551, + "step": 4310 + }, + { + "epoch": 0.1235167977126519, + "grad_norm": 0.8598331809043884, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0517, + "step": 4320 + }, + { + "epoch": 0.12380271622587563, + "grad_norm": 1.2555822134017944, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0499, + "step": 4330 + }, + { + "epoch": 0.12408863473909935, + "grad_norm": 0.5311633348464966, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0467, + "step": 4340 + }, + { + "epoch": 0.12437455325232309, + "grad_norm": 0.5674521327018738, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0564, + "step": 4350 + }, + { + "epoch": 0.12466047176554682, + "grad_norm": 0.5226582884788513, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0503, + "step": 4360 + }, + { + "epoch": 0.12494639027877055, + "grad_norm": 0.8510275483131409, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0626, + "step": 4370 + }, + { + "epoch": 0.1252323087919943, + "grad_norm": 1.6184005737304688, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0565, + "step": 4380 + }, + { + "epoch": 0.125518227305218, + "grad_norm": 0.7836401462554932, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0567, + "step": 4390 + }, + { + "epoch": 0.12580414581844174, + "grad_norm": 0.686989963054657, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0727, + "step": 4400 + }, + { + "epoch": 0.12609006433166547, + "grad_norm": 0.6000984907150269, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0459, + "step": 4410 + }, + { + "epoch": 0.1263759828448892, + "grad_norm": 0.8751336932182312, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0509, + "step": 4420 + }, + { + "epoch": 0.12666190135811295, + "grad_norm": 0.9281551837921143, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0536, + "step": 4430 + }, + { + "epoch": 0.12694781987133666, + "grad_norm": 0.5268979668617249, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 0.1272337383845604, + "grad_norm": 0.9246962070465088, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0525, + "step": 4450 + }, + { + "epoch": 0.12751965689778413, + "grad_norm": 1.2159569263458252, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0559, + "step": 4460 + }, + { + "epoch": 0.12780557541100787, + "grad_norm": 1.1705470085144043, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0473, + "step": 4470 + }, + { + "epoch": 0.12809149392423158, + "grad_norm": 0.4624033570289612, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0385, + "step": 4480 + }, + { + "epoch": 0.12837741243745532, + "grad_norm": 0.68497633934021, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.055, + "step": 4490 + }, + { + "epoch": 0.12866333095067906, + "grad_norm": 0.6132450699806213, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0512, + "step": 4500 + }, + { + "epoch": 0.1289492494639028, + "grad_norm": 0.7438398003578186, + "learning_rate": 1.935753861926916e-05, + "loss": 0.057, + "step": 4510 + }, + { + "epoch": 0.12923516797712653, + "grad_norm": 1.01064133644104, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0542, + "step": 4520 + }, + { + "epoch": 0.12952108649035024, + "grad_norm": 0.7620115280151367, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0511, + "step": 4530 + }, + { + "epoch": 0.12980700500357398, + "grad_norm": 0.8325042128562927, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0434, + "step": 4540 + }, + { + "epoch": 0.13009292351679771, + "grad_norm": 1.333525538444519, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0527, + "step": 4550 + }, + { + "epoch": 0.13037884203002145, + "grad_norm": 0.5498093962669373, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0455, + "step": 4560 + }, + { + "epoch": 0.13066476054324516, + "grad_norm": 0.8072503209114075, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0471, + "step": 4570 + }, + { + "epoch": 0.1309506790564689, + "grad_norm": 0.7596970200538635, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0476, + "step": 4580 + }, + { + "epoch": 0.13123659756969264, + "grad_norm": 0.5895066857337952, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.058, + "step": 4590 + }, + { + "epoch": 0.13152251608291637, + "grad_norm": 0.7977209687232971, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0453, + "step": 4600 + }, + { + "epoch": 0.1318084345961401, + "grad_norm": 0.6070771813392639, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0626, + "step": 4610 + }, + { + "epoch": 0.13209435310936382, + "grad_norm": 0.776318371295929, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0502, + "step": 4620 + }, + { + "epoch": 0.13238027162258756, + "grad_norm": 0.7913787961006165, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0495, + "step": 4630 + }, + { + "epoch": 0.1326661901358113, + "grad_norm": 0.7327920794487, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0537, + "step": 4640 + }, + { + "epoch": 0.13295210864903503, + "grad_norm": 1.2004122734069824, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0479, + "step": 4650 + }, + { + "epoch": 0.13323802716225874, + "grad_norm": 0.663301408290863, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0426, + "step": 4660 + }, + { + "epoch": 0.13352394567548248, + "grad_norm": 0.7744486331939697, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0538, + "step": 4670 + }, + { + "epoch": 0.13380986418870622, + "grad_norm": 0.6179795265197754, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.13409578270192996, + "grad_norm": 0.6461634635925293, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0458, + "step": 4690 + }, + { + "epoch": 0.1343817012151537, + "grad_norm": 0.6578474640846252, + "learning_rate": 1.928703895604588e-05, + "loss": 0.064, + "step": 4700 + }, + { + "epoch": 0.1346676197283774, + "grad_norm": 0.8851020336151123, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0632, + "step": 4710 + }, + { + "epoch": 0.13495353824160114, + "grad_norm": 0.4704781472682953, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0431, + "step": 4720 + }, + { + "epoch": 0.13523945675482488, + "grad_norm": 0.9809741377830505, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.059, + "step": 4730 + }, + { + "epoch": 0.13552537526804861, + "grad_norm": 0.9307458400726318, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0528, + "step": 4740 + }, + { + "epoch": 0.13581129378127232, + "grad_norm": 0.8084405660629272, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0603, + "step": 4750 + }, + { + "epoch": 0.13609721229449606, + "grad_norm": 0.6919799447059631, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0589, + "step": 4760 + }, + { + "epoch": 0.1363831308077198, + "grad_norm": 0.8543849587440491, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0508, + "step": 4770 + }, + { + "epoch": 0.13666904932094354, + "grad_norm": 0.6308473348617554, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0485, + "step": 4780 + }, + { + "epoch": 0.13695496783416727, + "grad_norm": 0.739931046962738, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0534, + "step": 4790 + }, + { + "epoch": 0.13724088634739098, + "grad_norm": 0.7895604372024536, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0449, + "step": 4800 + }, + { + "epoch": 0.13752680486061472, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0496, + "step": 4810 + }, + { + "epoch": 0.13781272337383846, + "grad_norm": 0.5999978184700012, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.1380986418870622, + "grad_norm": 0.8037213087081909, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0667, + "step": 4830 + }, + { + "epoch": 0.1383845604002859, + "grad_norm": 0.7414689064025879, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0509, + "step": 4840 + }, + { + "epoch": 0.13867047891350964, + "grad_norm": 0.6627739667892456, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0515, + "step": 4850 + }, + { + "epoch": 0.13895639742673338, + "grad_norm": 0.6969587802886963, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0626, + "step": 4860 + }, + { + "epoch": 0.13924231593995712, + "grad_norm": 0.7554855942726135, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0478, + "step": 4870 + }, + { + "epoch": 0.13952823445318085, + "grad_norm": 0.5623564124107361, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.044, + "step": 4880 + }, + { + "epoch": 0.13981415296640456, + "grad_norm": 0.6897832751274109, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0466, + "step": 4890 + }, + { + "epoch": 0.1401000714796283, + "grad_norm": 0.5474520921707153, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0514, + "step": 4900 + }, + { + "epoch": 0.14038598999285204, + "grad_norm": 0.9736361503601074, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0472, + "step": 4910 + }, + { + "epoch": 0.14067190850607578, + "grad_norm": 0.5566041469573975, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0533, + "step": 4920 + }, + { + "epoch": 0.1409578270192995, + "grad_norm": 1.0295166969299316, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0478, + "step": 4930 + }, + { + "epoch": 0.14124374553252322, + "grad_norm": 1.0931389331817627, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0652, + "step": 4940 + }, + { + "epoch": 0.14152966404574696, + "grad_norm": 1.3054399490356445, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0564, + "step": 4950 + }, + { + "epoch": 0.1418155825589707, + "grad_norm": 0.45592883229255676, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0456, + "step": 4960 + }, + { + "epoch": 0.14210150107219444, + "grad_norm": 0.6758268475532532, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0543, + "step": 4970 + }, + { + "epoch": 0.14238741958541815, + "grad_norm": 0.9643615484237671, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 0.14267333809864188, + "grad_norm": 0.565969705581665, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0458, + "step": 4990 + }, + { + "epoch": 0.14295925661186562, + "grad_norm": 0.8053064346313477, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0558, + "step": 5000 + }, + { + "epoch": 0.14324517512508936, + "grad_norm": 0.606215238571167, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0562, + "step": 5010 + }, + { + "epoch": 0.14353109363831307, + "grad_norm": 0.5565656423568726, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0543, + "step": 5020 + }, + { + "epoch": 0.1438170121515368, + "grad_norm": 0.353696346282959, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0451, + "step": 5030 + }, + { + "epoch": 0.14410293066476054, + "grad_norm": 0.6627641916275024, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0607, + "step": 5040 + }, + { + "epoch": 0.14438884917798428, + "grad_norm": 0.7896742224693298, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0496, + "step": 5050 + }, + { + "epoch": 0.14467476769120802, + "grad_norm": 0.7444631457328796, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0641, + "step": 5060 + }, + { + "epoch": 0.14496068620443173, + "grad_norm": 0.7871376872062683, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 0.14524660471765546, + "grad_norm": 0.7784642577171326, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0466, + "step": 5080 + }, + { + "epoch": 0.1455325232308792, + "grad_norm": 0.6950685381889343, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0457, + "step": 5090 + }, + { + "epoch": 0.14581844174410294, + "grad_norm": 1.0631619691848755, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0513, + "step": 5100 + }, + { + "epoch": 0.14610436025732665, + "grad_norm": 0.4327051639556885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0599, + "step": 5110 + }, + { + "epoch": 0.14639027877055039, + "grad_norm": 0.7790032029151917, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0617, + "step": 5120 + }, + { + "epoch": 0.14667619728377412, + "grad_norm": 0.42061591148376465, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.14696211579699786, + "grad_norm": 1.4090712070465088, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0564, + "step": 5140 + }, + { + "epoch": 0.1472480343102216, + "grad_norm": 0.540844738483429, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0505, + "step": 5150 + }, + { + "epoch": 0.1475339528234453, + "grad_norm": 0.5608566999435425, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0583, + "step": 5160 + }, + { + "epoch": 0.14781987133666905, + "grad_norm": 0.750708818435669, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0467, + "step": 5170 + }, + { + "epoch": 0.14810578984989278, + "grad_norm": 0.608989953994751, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0507, + "step": 5180 + }, + { + "epoch": 0.14839170836311652, + "grad_norm": 0.8176707029342651, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0455, + "step": 5190 + }, + { + "epoch": 0.14867762687634023, + "grad_norm": 0.5280511379241943, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0586, + "step": 5200 + }, + { + "epoch": 0.14896354538956397, + "grad_norm": 0.5914652347564697, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.042, + "step": 5210 + }, + { + "epoch": 0.1492494639027877, + "grad_norm": 0.4816238582134247, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0431, + "step": 5220 + }, + { + "epoch": 0.14953538241601144, + "grad_norm": 0.5413132309913635, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0453, + "step": 5230 + }, + { + "epoch": 0.14982130092923518, + "grad_norm": 0.749200701713562, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0505, + "step": 5240 + }, + { + "epoch": 0.1501072194424589, + "grad_norm": 0.8051598072052002, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.15039313795568263, + "grad_norm": 0.5365609526634216, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0383, + "step": 5260 + }, + { + "epoch": 0.15067905646890636, + "grad_norm": 0.5546812415122986, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0438, + "step": 5270 + }, + { + "epoch": 0.1509649749821301, + "grad_norm": 0.6248345375061035, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.045, + "step": 5280 + }, + { + "epoch": 0.1512508934953538, + "grad_norm": 0.42673179507255554, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0435, + "step": 5290 + }, + { + "epoch": 0.15153681200857755, + "grad_norm": 0.6677115559577942, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 0.15182273052180129, + "grad_norm": 0.4739227294921875, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0516, + "step": 5310 + }, + { + "epoch": 0.15210864903502502, + "grad_norm": 0.7931821346282959, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0566, + "step": 5320 + }, + { + "epoch": 0.15239456754824876, + "grad_norm": 0.6296460032463074, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0496, + "step": 5330 + }, + { + "epoch": 0.15268048606147247, + "grad_norm": 0.6713911890983582, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0462, + "step": 5340 + }, + { + "epoch": 0.1529664045746962, + "grad_norm": 1.088040828704834, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0663, + "step": 5350 + }, + { + "epoch": 0.15325232308791994, + "grad_norm": 1.4942265748977661, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0541, + "step": 5360 + }, + { + "epoch": 0.15353824160114368, + "grad_norm": 1.5721286535263062, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0546, + "step": 5370 + }, + { + "epoch": 0.1538241601143674, + "grad_norm": 0.9329798221588135, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0538, + "step": 5380 + }, + { + "epoch": 0.15411007862759113, + "grad_norm": 0.5658103823661804, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0561, + "step": 5390 + }, + { + "epoch": 0.15439599714081487, + "grad_norm": 0.6210218071937561, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.054, + "step": 5400 + }, + { + "epoch": 0.1546819156540386, + "grad_norm": 0.7934702634811401, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0506, + "step": 5410 + }, + { + "epoch": 0.15496783416726234, + "grad_norm": 1.0321810245513916, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0483, + "step": 5420 + }, + { + "epoch": 0.15525375268048605, + "grad_norm": 0.6226248145103455, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0464, + "step": 5430 + }, + { + "epoch": 0.1555396711937098, + "grad_norm": 0.6217877864837646, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0466, + "step": 5440 + }, + { + "epoch": 0.15582558970693353, + "grad_norm": 0.44068101048469543, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0517, + "step": 5450 + }, + { + "epoch": 0.15611150822015726, + "grad_norm": 0.4715922772884369, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0391, + "step": 5460 + }, + { + "epoch": 0.15639742673338097, + "grad_norm": 0.6649858951568604, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0524, + "step": 5470 + }, + { + "epoch": 0.1566833452466047, + "grad_norm": 0.5635918974876404, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.054, + "step": 5480 + }, + { + "epoch": 0.15696926375982845, + "grad_norm": 0.5584990978240967, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0559, + "step": 5490 + }, + { + "epoch": 0.15725518227305219, + "grad_norm": 0.7777124047279358, + "learning_rate": 1.895206504082557e-05, + "loss": 0.052, + "step": 5500 + }, + { + "epoch": 0.15754110078627592, + "grad_norm": 0.7057285308837891, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0507, + "step": 5510 + }, + { + "epoch": 0.15782701929949963, + "grad_norm": 0.4290146827697754, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0508, + "step": 5520 + }, + { + "epoch": 0.15811293781272337, + "grad_norm": 0.7333746552467346, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0609, + "step": 5530 + }, + { + "epoch": 0.1583988563259471, + "grad_norm": 0.6905514001846313, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0441, + "step": 5540 + }, + { + "epoch": 0.15868477483917084, + "grad_norm": 0.4859441816806793, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0586, + "step": 5550 + }, + { + "epoch": 0.15897069335239455, + "grad_norm": 0.4259501099586487, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0446, + "step": 5560 + }, + { + "epoch": 0.1592566118656183, + "grad_norm": 0.7659216523170471, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0486, + "step": 5570 + }, + { + "epoch": 0.15954253037884203, + "grad_norm": 0.6377918124198914, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0497, + "step": 5580 + }, + { + "epoch": 0.15982844889206577, + "grad_norm": 0.9122095704078674, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0497, + "step": 5590 + }, + { + "epoch": 0.1601143674052895, + "grad_norm": 0.5986319780349731, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0789, + "step": 5600 + }, + { + "epoch": 0.1604002859185132, + "grad_norm": 0.6486982107162476, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0481, + "step": 5610 + }, + { + "epoch": 0.16068620443173695, + "grad_norm": 0.9778286814689636, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0609, + "step": 5620 + }, + { + "epoch": 0.1609721229449607, + "grad_norm": 0.9133608341217041, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0499, + "step": 5630 + }, + { + "epoch": 0.16125804145818443, + "grad_norm": 0.8979085087776184, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0539, + "step": 5640 + }, + { + "epoch": 0.16154395997140814, + "grad_norm": 0.7787102460861206, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0593, + "step": 5650 + }, + { + "epoch": 0.16182987848463187, + "grad_norm": 0.8269296884536743, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0466, + "step": 5660 + }, + { + "epoch": 0.1621157969978556, + "grad_norm": 1.0018537044525146, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0542, + "step": 5670 + }, + { + "epoch": 0.16240171551107935, + "grad_norm": 0.6690066456794739, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0504, + "step": 5680 + }, + { + "epoch": 0.16268763402430308, + "grad_norm": 0.8186119198799133, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0523, + "step": 5690 + }, + { + "epoch": 0.1629735525375268, + "grad_norm": 0.6039218902587891, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.053, + "step": 5700 + }, + { + "epoch": 0.16325947105075053, + "grad_norm": 0.5570294857025146, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0627, + "step": 5710 + }, + { + "epoch": 0.16354538956397427, + "grad_norm": 0.6330029368400574, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.043, + "step": 5720 + }, + { + "epoch": 0.163831308077198, + "grad_norm": 0.42857953906059265, + "learning_rate": 1.884459101447439e-05, + "loss": 0.043, + "step": 5730 + }, + { + "epoch": 0.16411722659042172, + "grad_norm": 0.6611765027046204, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0478, + "step": 5740 + }, + { + "epoch": 0.16440314510364545, + "grad_norm": 0.5025321841239929, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0563, + "step": 5750 + }, + { + "epoch": 0.1646890636168692, + "grad_norm": 0.468772292137146, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0579, + "step": 5760 + }, + { + "epoch": 0.16497498213009293, + "grad_norm": 0.8914149403572083, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0451, + "step": 5770 + }, + { + "epoch": 0.16526090064331667, + "grad_norm": 0.7421362996101379, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0446, + "step": 5780 + }, + { + "epoch": 0.16554681915654038, + "grad_norm": 0.6159907579421997, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0486, + "step": 5790 + }, + { + "epoch": 0.1658327376697641, + "grad_norm": 0.7762402892112732, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0528, + "step": 5800 + }, + { + "epoch": 0.16611865618298785, + "grad_norm": 0.688562273979187, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0562, + "step": 5810 + }, + { + "epoch": 0.1664045746962116, + "grad_norm": 0.6233720183372498, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0454, + "step": 5820 + }, + { + "epoch": 0.1666904932094353, + "grad_norm": 1.0762931108474731, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0586, + "step": 5830 + }, + { + "epoch": 0.16697641172265903, + "grad_norm": 0.6782101988792419, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0486, + "step": 5840 + }, + { + "epoch": 0.16726233023588277, + "grad_norm": 0.8854986429214478, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0668, + "step": 5850 + }, + { + "epoch": 0.1675482487491065, + "grad_norm": 0.6537308096885681, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0456, + "step": 5860 + }, + { + "epoch": 0.16783416726233025, + "grad_norm": 1.4588080644607544, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0531, + "step": 5870 + }, + { + "epoch": 0.16812008577555396, + "grad_norm": 0.4888838529586792, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0608, + "step": 5880 + }, + { + "epoch": 0.1684060042887777, + "grad_norm": 0.6046859622001648, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0596, + "step": 5890 + }, + { + "epoch": 0.16869192280200143, + "grad_norm": 1.0373053550720215, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0672, + "step": 5900 + }, + { + "epoch": 0.16897784131522517, + "grad_norm": 0.7728743553161621, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0567, + "step": 5910 + }, + { + "epoch": 0.16926375982844888, + "grad_norm": 0.7804396152496338, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0443, + "step": 5920 + }, + { + "epoch": 0.16954967834167262, + "grad_norm": 0.5331568717956543, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0462, + "step": 5930 + }, + { + "epoch": 0.16983559685489635, + "grad_norm": 0.5623118877410889, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0549, + "step": 5940 + }, + { + "epoch": 0.1701215153681201, + "grad_norm": 0.5113009214401245, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.17040743388134383, + "grad_norm": 0.45996031165122986, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0474, + "step": 5960 + }, + { + "epoch": 0.17069335239456754, + "grad_norm": 0.9673702716827393, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0496, + "step": 5970 + }, + { + "epoch": 0.17097927090779128, + "grad_norm": 0.6134442687034607, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0472, + "step": 5980 + }, + { + "epoch": 0.171265189421015, + "grad_norm": 0.5929660797119141, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0448, + "step": 5990 + }, + { + "epoch": 0.17155110793423875, + "grad_norm": 0.6973591446876526, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0561, + "step": 6000 + }, + { + "epoch": 0.17183702644746246, + "grad_norm": 0.6361686587333679, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0528, + "step": 6010 + }, + { + "epoch": 0.1721229449606862, + "grad_norm": 0.8463344573974609, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0505, + "step": 6020 + }, + { + "epoch": 0.17240886347390993, + "grad_norm": 0.7931243777275085, + "learning_rate": 1.869709961183946e-05, + "loss": 0.047, + "step": 6030 + }, + { + "epoch": 0.17269478198713367, + "grad_norm": 0.8827673196792603, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0449, + "step": 6040 + }, + { + "epoch": 0.1729807005003574, + "grad_norm": 0.624167263507843, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0432, + "step": 6050 + }, + { + "epoch": 0.17326661901358112, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0546, + "step": 6060 + }, + { + "epoch": 0.17355253752680486, + "grad_norm": 0.6836652755737305, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0463, + "step": 6070 + }, + { + "epoch": 0.1738384560400286, + "grad_norm": 0.5454772114753723, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0554, + "step": 6080 + }, + { + "epoch": 0.17412437455325233, + "grad_norm": 0.3758164048194885, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0437, + "step": 6090 + }, + { + "epoch": 0.17441029306647604, + "grad_norm": 0.4269026517868042, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0436, + "step": 6100 + }, + { + "epoch": 0.17469621157969978, + "grad_norm": 1.3504232168197632, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0563, + "step": 6110 + }, + { + "epoch": 0.17498213009292352, + "grad_norm": 0.6270191669464111, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0552, + "step": 6120 + }, + { + "epoch": 0.17526804860614725, + "grad_norm": 0.7632624506950378, + "learning_rate": 1.864612143364565e-05, + "loss": 0.042, + "step": 6130 + }, + { + "epoch": 0.175553967119371, + "grad_norm": 0.7420883774757385, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0472, + "step": 6140 + }, + { + "epoch": 0.1758398856325947, + "grad_norm": 0.38518550992012024, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0494, + "step": 6150 + }, + { + "epoch": 0.17612580414581844, + "grad_norm": 0.4203122556209564, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.049, + "step": 6160 + }, + { + "epoch": 0.17641172265904217, + "grad_norm": 0.843169093132019, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0528, + "step": 6170 + }, + { + "epoch": 0.1766976411722659, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0441, + "step": 6180 + }, + { + "epoch": 0.17698355968548962, + "grad_norm": 0.9894040822982788, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0494, + "step": 6190 + }, + { + "epoch": 0.17726947819871336, + "grad_norm": 0.8269744515419006, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0533, + "step": 6200 + }, + { + "epoch": 0.1775553967119371, + "grad_norm": 0.7923200726509094, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0518, + "step": 6210 + }, + { + "epoch": 0.17784131522516083, + "grad_norm": 0.580436646938324, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 0.17812723373838457, + "grad_norm": 1.0633399486541748, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0528, + "step": 6230 + }, + { + "epoch": 0.17841315225160828, + "grad_norm": 0.925599217414856, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0552, + "step": 6240 + }, + { + "epoch": 0.17869907076483202, + "grad_norm": 0.5874597430229187, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0473, + "step": 6250 + }, + { + "epoch": 0.17898498927805576, + "grad_norm": 0.9065818190574646, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0504, + "step": 6260 + }, + { + "epoch": 0.1792709077912795, + "grad_norm": 0.9060930609703064, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0578, + "step": 6270 + }, + { + "epoch": 0.1795568263045032, + "grad_norm": 0.6221855878829956, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.17984274481772694, + "grad_norm": 0.589621901512146, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0452, + "step": 6290 + }, + { + "epoch": 0.18012866333095068, + "grad_norm": 0.4308580756187439, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0528, + "step": 6300 + }, + { + "epoch": 0.18041458184417442, + "grad_norm": 0.34031248092651367, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0544, + "step": 6310 + }, + { + "epoch": 0.18070050035739815, + "grad_norm": 0.6438931226730347, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0411, + "step": 6320 + }, + { + "epoch": 0.18098641887062186, + "grad_norm": 0.5436957478523254, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0381, + "step": 6330 + }, + { + "epoch": 0.1812723373838456, + "grad_norm": 0.7326043248176575, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0486, + "step": 6340 + }, + { + "epoch": 0.18155825589706934, + "grad_norm": 0.9194608330726624, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0455, + "step": 6350 + }, + { + "epoch": 0.18184417441029307, + "grad_norm": 0.9366886019706726, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0529, + "step": 6360 + }, + { + "epoch": 0.18213009292351678, + "grad_norm": 0.3178311586380005, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0455, + "step": 6370 + }, + { + "epoch": 0.18241601143674052, + "grad_norm": 0.9811000823974609, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.054, + "step": 6380 + }, + { + "epoch": 0.18270192994996426, + "grad_norm": 0.4635869562625885, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0466, + "step": 6390 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.6958444118499756, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0448, + "step": 6400 + }, + { + "epoch": 0.18327376697641173, + "grad_norm": 0.765814483165741, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0558, + "step": 6410 + }, + { + "epoch": 0.18355968548963544, + "grad_norm": 0.4117525815963745, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0484, + "step": 6420 + }, + { + "epoch": 0.18384560400285918, + "grad_norm": 0.6114997267723083, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0449, + "step": 6430 + }, + { + "epoch": 0.18413152251608292, + "grad_norm": 0.6006572842597961, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0442, + "step": 6440 + }, + { + "epoch": 0.18441744102930666, + "grad_norm": 0.5918669104576111, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0472, + "step": 6450 + }, + { + "epoch": 0.18470335954253037, + "grad_norm": 0.42107391357421875, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0471, + "step": 6460 + }, + { + "epoch": 0.1849892780557541, + "grad_norm": 0.5666350722312927, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0451, + "step": 6470 + }, + { + "epoch": 0.18527519656897784, + "grad_norm": 0.6074198484420776, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.051, + "step": 6480 + }, + { + "epoch": 0.18556111508220158, + "grad_norm": 0.771105945110321, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0402, + "step": 6490 + }, + { + "epoch": 0.18584703359542531, + "grad_norm": 0.6381934881210327, + "learning_rate": 1.844974808419918e-05, + "loss": 0.049, + "step": 6500 + }, + { + "epoch": 0.18613295210864902, + "grad_norm": 0.4039069712162018, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0477, + "step": 6510 + }, + { + "epoch": 0.18641887062187276, + "grad_norm": 0.8936404585838318, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0515, + "step": 6520 + }, + { + "epoch": 0.1867047891350965, + "grad_norm": 0.5358276963233948, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0397, + "step": 6530 + }, + { + "epoch": 0.18699070764832024, + "grad_norm": 0.7260947823524475, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0501, + "step": 6540 + }, + { + "epoch": 0.18727662616154395, + "grad_norm": 0.6378960609436035, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0575, + "step": 6550 + }, + { + "epoch": 0.18756254467476768, + "grad_norm": 0.5879429578781128, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.041, + "step": 6560 + }, + { + "epoch": 0.18784846318799142, + "grad_norm": 0.846297025680542, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0494, + "step": 6570 + }, + { + "epoch": 0.18813438170121516, + "grad_norm": 0.5211764574050903, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0463, + "step": 6580 + }, + { + "epoch": 0.1884203002144389, + "grad_norm": 0.8060504794120789, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0486, + "step": 6590 + }, + { + "epoch": 0.1887062187276626, + "grad_norm": 0.5741685628890991, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0435, + "step": 6600 + }, + { + "epoch": 0.18899213724088634, + "grad_norm": 0.6195408701896667, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0609, + "step": 6610 + }, + { + "epoch": 0.18927805575411008, + "grad_norm": 0.46843090653419495, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0477, + "step": 6620 + }, + { + "epoch": 0.18956397426733382, + "grad_norm": 0.5169982314109802, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0515, + "step": 6630 + }, + { + "epoch": 0.18984989278055753, + "grad_norm": 0.5571608543395996, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0492, + "step": 6640 + }, + { + "epoch": 0.19013581129378126, + "grad_norm": 0.7798209190368652, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0682, + "step": 6650 + }, + { + "epoch": 0.190421729807005, + "grad_norm": 0.6120383143424988, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0516, + "step": 6660 + }, + { + "epoch": 0.19070764832022874, + "grad_norm": 1.0191924571990967, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.049, + "step": 6670 + }, + { + "epoch": 0.19099356683345248, + "grad_norm": 0.5271646976470947, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0461, + "step": 6680 + }, + { + "epoch": 0.1912794853466762, + "grad_norm": 0.3315111994743347, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 0.19156540385989992, + "grad_norm": 0.7598944306373596, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0576, + "step": 6700 + }, + { + "epoch": 0.19185132237312366, + "grad_norm": 0.8039186596870422, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0489, + "step": 6710 + }, + { + "epoch": 0.1921372408863474, + "grad_norm": 0.911704957485199, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0508, + "step": 6720 + }, + { + "epoch": 0.1924231593995711, + "grad_norm": 0.6092261672019958, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0494, + "step": 6730 + }, + { + "epoch": 0.19270907791279485, + "grad_norm": 0.7890674471855164, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.049, + "step": 6740 + }, + { + "epoch": 0.19299499642601858, + "grad_norm": 0.8601320385932922, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0402, + "step": 6750 + }, + { + "epoch": 0.19328091493924232, + "grad_norm": 0.8750951290130615, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0517, + "step": 6760 + }, + { + "epoch": 0.19356683345246606, + "grad_norm": 0.7143217921257019, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0414, + "step": 6770 + }, + { + "epoch": 0.19385275196568977, + "grad_norm": 0.8340809345245361, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.1941386704789135, + "grad_norm": 0.4074079692363739, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0455, + "step": 6790 + }, + { + "epoch": 0.19442458899213724, + "grad_norm": 0.5369135737419128, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0472, + "step": 6800 + }, + { + "epoch": 0.19471050750536098, + "grad_norm": 0.44467195868492126, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 0.1949964260185847, + "grad_norm": 0.6032440662384033, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0479, + "step": 6820 + }, + { + "epoch": 0.19528234453180843, + "grad_norm": 0.4078349173069, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 0.19556826304503216, + "grad_norm": 0.49480268359184265, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0432, + "step": 6840 + }, + { + "epoch": 0.1958541815582559, + "grad_norm": 0.9844514727592468, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0479, + "step": 6850 + }, + { + "epoch": 0.19614010007147964, + "grad_norm": 1.1353951692581177, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0539, + "step": 6860 + }, + { + "epoch": 0.19642601858470335, + "grad_norm": 0.7535272836685181, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0572, + "step": 6870 + }, + { + "epoch": 0.1967119370979271, + "grad_norm": 0.4950162470340729, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0466, + "step": 6880 + }, + { + "epoch": 0.19699785561115082, + "grad_norm": 0.5310598015785217, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0487, + "step": 6890 + }, + { + "epoch": 0.19728377412437456, + "grad_norm": 0.9481188654899597, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0448, + "step": 6900 + }, + { + "epoch": 0.19756969263759827, + "grad_norm": 0.5303207039833069, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0419, + "step": 6910 + }, + { + "epoch": 0.197855611150822, + "grad_norm": 0.6180852055549622, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 0.19814152966404575, + "grad_norm": 0.5310384631156921, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0471, + "step": 6930 + }, + { + "epoch": 0.19842744817726948, + "grad_norm": 0.546660304069519, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0481, + "step": 6940 + }, + { + "epoch": 0.19871336669049322, + "grad_norm": 0.7824214696884155, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0423, + "step": 6950 + }, + { + "epoch": 0.19899928520371693, + "grad_norm": 0.9130761623382568, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0436, + "step": 6960 + }, + { + "epoch": 0.19928520371694067, + "grad_norm": 1.0512481927871704, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 0.1995711222301644, + "grad_norm": 0.8660218715667725, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0533, + "step": 6980 + }, + { + "epoch": 0.19985704074338814, + "grad_norm": 0.5280078649520874, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0455, + "step": 6990 + }, + { + "epoch": 0.20014295925661185, + "grad_norm": 0.6151753067970276, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0476, + "step": 7000 + }, + { + "epoch": 0.2004288777698356, + "grad_norm": 0.7165628671646118, + "learning_rate": 1.815952390818299e-05, + "loss": 0.051, + "step": 7010 + }, + { + "epoch": 0.20071479628305933, + "grad_norm": 0.6857513189315796, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0566, + "step": 7020 + }, + { + "epoch": 0.20100071479628306, + "grad_norm": 0.5589154958724976, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0741, + "step": 7030 + }, + { + "epoch": 0.2012866333095068, + "grad_norm": 0.6684713959693909, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0461, + "step": 7040 + }, + { + "epoch": 0.2015725518227305, + "grad_norm": 0.41142046451568604, + "learning_rate": 1.813582526827608e-05, + "loss": 0.043, + "step": 7050 + }, + { + "epoch": 0.20185847033595425, + "grad_norm": 0.29734253883361816, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0464, + "step": 7060 + }, + { + "epoch": 0.20214438884917799, + "grad_norm": 0.3914707899093628, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.052, + "step": 7070 + }, + { + "epoch": 0.20243030736240172, + "grad_norm": 0.5075880885124207, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0401, + "step": 7080 + }, + { + "epoch": 0.20271622587562543, + "grad_norm": 0.6182138919830322, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0428, + "step": 7090 + }, + { + "epoch": 0.20300214438884917, + "grad_norm": 1.0438663959503174, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0476, + "step": 7100 + }, + { + "epoch": 0.2032880629020729, + "grad_norm": 0.4646940529346466, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0419, + "step": 7110 + }, + { + "epoch": 0.20357398141529665, + "grad_norm": 0.4236893951892853, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0539, + "step": 7120 + }, + { + "epoch": 0.20385989992852038, + "grad_norm": 0.7975651025772095, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0459, + "step": 7130 + }, + { + "epoch": 0.2041458184417441, + "grad_norm": 0.9628227949142456, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0568, + "step": 7140 + }, + { + "epoch": 0.20443173695496783, + "grad_norm": 0.8878718614578247, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0528, + "step": 7150 + }, + { + "epoch": 0.20471765546819157, + "grad_norm": 0.5407359004020691, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0417, + "step": 7160 + }, + { + "epoch": 0.2050035739814153, + "grad_norm": 0.4407803416252136, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0435, + "step": 7170 + }, + { + "epoch": 0.20528949249463901, + "grad_norm": 0.4055456221103668, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0652, + "step": 7180 + }, + { + "epoch": 0.20557541100786275, + "grad_norm": 0.44706887006759644, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0476, + "step": 7190 + }, + { + "epoch": 0.2058613295210865, + "grad_norm": 1.2640881538391113, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0496, + "step": 7200 + }, + { + "epoch": 0.20614724803431023, + "grad_norm": 0.3773214817047119, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0455, + "step": 7210 + }, + { + "epoch": 0.20643316654753396, + "grad_norm": 0.6460191011428833, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0765, + "step": 7220 + }, + { + "epoch": 0.20671908506075767, + "grad_norm": 0.6048172116279602, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0506, + "step": 7230 + }, + { + "epoch": 0.2070050035739814, + "grad_norm": 0.38502392172813416, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0487, + "step": 7240 + }, + { + "epoch": 0.20729092208720515, + "grad_norm": 1.5727262496948242, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.20757684060042889, + "grad_norm": 0.3985368609428406, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0501, + "step": 7260 + }, + { + "epoch": 0.2078627591136526, + "grad_norm": 0.4519219994544983, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0542, + "step": 7270 + }, + { + "epoch": 0.20814867762687633, + "grad_norm": 0.6547327637672424, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0628, + "step": 7280 + }, + { + "epoch": 0.20843459614010007, + "grad_norm": 0.7864896655082703, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0521, + "step": 7290 + }, + { + "epoch": 0.2087205146533238, + "grad_norm": 0.6605416536331177, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0501, + "step": 7300 + }, + { + "epoch": 0.20900643316654754, + "grad_norm": 0.8260928988456726, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 0.20929235167977125, + "grad_norm": 0.7167025804519653, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0465, + "step": 7320 + }, + { + "epoch": 0.209578270192995, + "grad_norm": 0.6838316917419434, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0449, + "step": 7330 + }, + { + "epoch": 0.20986418870621873, + "grad_norm": 0.46520882844924927, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0441, + "step": 7340 + }, + { + "epoch": 0.21015010721944247, + "grad_norm": 0.680860698223114, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0498, + "step": 7350 + }, + { + "epoch": 0.21043602573266618, + "grad_norm": 0.6697542071342468, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0361, + "step": 7360 + }, + { + "epoch": 0.21072194424588991, + "grad_norm": 0.9322425127029419, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0561, + "step": 7370 + }, + { + "epoch": 0.21100786275911365, + "grad_norm": 0.7454982399940491, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0464, + "step": 7380 + }, + { + "epoch": 0.2112937812723374, + "grad_norm": 0.5052962899208069, + "learning_rate": 1.792902262617481e-05, + "loss": 0.042, + "step": 7390 + }, + { + "epoch": 0.21157969978556113, + "grad_norm": 0.622719407081604, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0392, + "step": 7400 + }, + { + "epoch": 0.21186561829878484, + "grad_norm": 0.8296751976013184, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0512, + "step": 7410 + }, + { + "epoch": 0.21215153681200857, + "grad_norm": 0.7341750860214233, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0407, + "step": 7420 + }, + { + "epoch": 0.2124374553252323, + "grad_norm": 0.8206498026847839, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0491, + "step": 7430 + }, + { + "epoch": 0.21272337383845605, + "grad_norm": 0.5625871419906616, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0405, + "step": 7440 + }, + { + "epoch": 0.21300929235167976, + "grad_norm": 0.600284218788147, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0464, + "step": 7450 + }, + { + "epoch": 0.2132952108649035, + "grad_norm": 1.0839911699295044, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0481, + "step": 7460 + }, + { + "epoch": 0.21358112937812723, + "grad_norm": 0.45663371682167053, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0478, + "step": 7470 + }, + { + "epoch": 0.21386704789135097, + "grad_norm": 0.9196961522102356, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0566, + "step": 7480 + }, + { + "epoch": 0.2141529664045747, + "grad_norm": 0.5013288855552673, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0442, + "step": 7490 + }, + { + "epoch": 0.21443888491779842, + "grad_norm": 0.6444706916809082, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0484, + "step": 7500 + }, + { + "epoch": 0.21472480343102215, + "grad_norm": 0.5789361000061035, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0536, + "step": 7510 + }, + { + "epoch": 0.2150107219442459, + "grad_norm": 0.7474827170372009, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0526, + "step": 7520 + }, + { + "epoch": 0.21529664045746963, + "grad_norm": 0.7054215669631958, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0538, + "step": 7530 + }, + { + "epoch": 0.21558255897069334, + "grad_norm": 0.9778858423233032, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0533, + "step": 7540 + }, + { + "epoch": 0.21586847748391708, + "grad_norm": 0.7189548015594482, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0479, + "step": 7550 + }, + { + "epoch": 0.2161543959971408, + "grad_norm": 0.8761522769927979, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0512, + "step": 7560 + }, + { + "epoch": 0.21644031451036455, + "grad_norm": 0.6686418652534485, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.06, + "step": 7570 + }, + { + "epoch": 0.2167262330235883, + "grad_norm": 0.6385156512260437, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0495, + "step": 7580 + }, + { + "epoch": 0.217012151536812, + "grad_norm": 0.4785522520542145, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0477, + "step": 7590 + }, + { + "epoch": 0.21729807005003574, + "grad_norm": 0.883179783821106, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0472, + "step": 7600 + }, + { + "epoch": 0.21758398856325947, + "grad_norm": 0.5431568026542664, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0383, + "step": 7610 + }, + { + "epoch": 0.2178699070764832, + "grad_norm": 0.7085764408111572, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0465, + "step": 7620 + }, + { + "epoch": 0.21815582558970692, + "grad_norm": 0.4877212643623352, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0517, + "step": 7630 + }, + { + "epoch": 0.21844174410293066, + "grad_norm": 0.6874392032623291, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0555, + "step": 7640 + }, + { + "epoch": 0.2187276626161544, + "grad_norm": 0.9611791372299194, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0415, + "step": 7650 + }, + { + "epoch": 0.21901358112937813, + "grad_norm": 0.3618314862251282, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0601, + "step": 7660 + }, + { + "epoch": 0.21929949964260187, + "grad_norm": 0.5366251468658447, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0383, + "step": 7670 + }, + { + "epoch": 0.21958541815582558, + "grad_norm": 0.6323129534721375, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0536, + "step": 7680 + }, + { + "epoch": 0.21987133666904932, + "grad_norm": 0.4621681571006775, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0442, + "step": 7690 + }, + { + "epoch": 0.22015725518227305, + "grad_norm": 0.9297679662704468, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0517, + "step": 7700 + }, + { + "epoch": 0.2204431736954968, + "grad_norm": 0.5950489640235901, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0468, + "step": 7710 + }, + { + "epoch": 0.2207290922087205, + "grad_norm": 0.30251142382621765, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0471, + "step": 7720 + }, + { + "epoch": 0.22101501072194424, + "grad_norm": 0.6247804760932922, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0488, + "step": 7730 + }, + { + "epoch": 0.22130092923516798, + "grad_norm": 0.7118366360664368, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0567, + "step": 7740 + }, + { + "epoch": 0.2215868477483917, + "grad_norm": 0.6265056133270264, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.06, + "step": 7750 + }, + { + "epoch": 0.22187276626161545, + "grad_norm": 0.7232056260108948, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0393, + "step": 7760 + }, + { + "epoch": 0.22215868477483916, + "grad_norm": 0.7981307506561279, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0518, + "step": 7770 + }, + { + "epoch": 0.2224446032880629, + "grad_norm": 0.4492819011211395, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0425, + "step": 7780 + }, + { + "epoch": 0.22273052180128664, + "grad_norm": 0.578440248966217, + "learning_rate": 1.767371389304538e-05, + "loss": 0.043, + "step": 7790 + }, + { + "epoch": 0.22301644031451037, + "grad_norm": 0.8093826174736023, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0571, + "step": 7800 + }, + { + "epoch": 0.22330235882773408, + "grad_norm": 0.864661455154419, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0429, + "step": 7810 + }, + { + "epoch": 0.22358827734095782, + "grad_norm": 0.50054532289505, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0404, + "step": 7820 + }, + { + "epoch": 0.22387419585418156, + "grad_norm": 0.5690511465072632, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0406, + "step": 7830 + }, + { + "epoch": 0.2241601143674053, + "grad_norm": 0.7075231671333313, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0478, + "step": 7840 + }, + { + "epoch": 0.22444603288062903, + "grad_norm": 0.6326742768287659, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.065, + "step": 7850 + }, + { + "epoch": 0.22473195139385274, + "grad_norm": 0.48305049538612366, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0456, + "step": 7860 + }, + { + "epoch": 0.22501786990707648, + "grad_norm": 0.6333707571029663, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.048, + "step": 7870 + }, + { + "epoch": 0.22530378842030022, + "grad_norm": 0.6568662524223328, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0422, + "step": 7880 + }, + { + "epoch": 0.22558970693352395, + "grad_norm": 0.6302695870399475, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0589, + "step": 7890 + }, + { + "epoch": 0.22587562544674766, + "grad_norm": 0.6373940110206604, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0504, + "step": 7900 + }, + { + "epoch": 0.2261615439599714, + "grad_norm": 0.7108445167541504, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0486, + "step": 7910 + }, + { + "epoch": 0.22644746247319514, + "grad_norm": 0.5274208784103394, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0693, + "step": 7920 + }, + { + "epoch": 0.22673338098641888, + "grad_norm": 0.4020678997039795, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0461, + "step": 7930 + }, + { + "epoch": 0.2270192994996426, + "grad_norm": 0.5584745407104492, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0376, + "step": 7940 + }, + { + "epoch": 0.22730521801286632, + "grad_norm": 0.6614044904708862, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0461, + "step": 7950 + }, + { + "epoch": 0.22759113652609006, + "grad_norm": 0.506636917591095, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0431, + "step": 7960 + }, + { + "epoch": 0.2278770550393138, + "grad_norm": 0.5168156027793884, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0404, + "step": 7970 + }, + { + "epoch": 0.22816297355253753, + "grad_norm": 0.552480161190033, + "learning_rate": 1.754802282200567e-05, + "loss": 0.0565, + "step": 7980 + }, + { + "epoch": 0.22844889206576124, + "grad_norm": 0.8191191554069519, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0556, + "step": 7990 + }, + { + "epoch": 0.22873481057898498, + "grad_norm": 0.7767695188522339, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0447, + "step": 8000 + }, + { + "epoch": 0.22902072909220872, + "grad_norm": 0.9050281047821045, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0611, + "step": 8010 + }, + { + "epoch": 0.22930664760543246, + "grad_norm": 0.7805314660072327, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0532, + "step": 8020 + }, + { + "epoch": 0.2295925661186562, + "grad_norm": 0.6055987477302551, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0436, + "step": 8030 + }, + { + "epoch": 0.2298784846318799, + "grad_norm": 1.1075741052627563, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.053, + "step": 8040 + }, + { + "epoch": 0.23016440314510364, + "grad_norm": 0.6283855438232422, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0494, + "step": 8050 + }, + { + "epoch": 0.23045032165832738, + "grad_norm": 0.44009697437286377, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.047, + "step": 8060 + }, + { + "epoch": 0.23073624017155112, + "grad_norm": 0.4920162856578827, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0437, + "step": 8070 + }, + { + "epoch": 0.23102215868477483, + "grad_norm": 0.9286724328994751, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0513, + "step": 8080 + }, + { + "epoch": 0.23130807719799856, + "grad_norm": 0.6595107913017273, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0465, + "step": 8090 + }, + { + "epoch": 0.2315939957112223, + "grad_norm": 0.4930933713912964, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0422, + "step": 8100 + }, + { + "epoch": 0.23187991422444604, + "grad_norm": 0.6741859316825867, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0419, + "step": 8110 + }, + { + "epoch": 0.23216583273766978, + "grad_norm": 0.8081800937652588, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0449, + "step": 8120 + }, + { + "epoch": 0.23245175125089348, + "grad_norm": 1.0258036851882935, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0613, + "step": 8130 + }, + { + "epoch": 0.23273766976411722, + "grad_norm": 0.5007345080375671, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0473, + "step": 8140 + }, + { + "epoch": 0.23302358827734096, + "grad_norm": 0.3931804895401001, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0495, + "step": 8150 + }, + { + "epoch": 0.2333095067905647, + "grad_norm": 0.5907166600227356, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0449, + "step": 8160 + }, + { + "epoch": 0.2335954253037884, + "grad_norm": 0.49229851365089417, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0524, + "step": 8170 + }, + { + "epoch": 0.23388134381701214, + "grad_norm": 0.8386240601539612, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0527, + "step": 8180 + }, + { + "epoch": 0.23416726233023588, + "grad_norm": 0.7806615829467773, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0529, + "step": 8190 + }, + { + "epoch": 0.23445318084345962, + "grad_norm": 0.5716270804405212, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0534, + "step": 8200 + }, + { + "epoch": 0.23473909935668336, + "grad_norm": 1.165761947631836, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0591, + "step": 8210 + }, + { + "epoch": 0.23502501786990707, + "grad_norm": 0.867967426776886, + "learning_rate": 1.738529690353544e-05, + "loss": 0.049, + "step": 8220 + }, + { + "epoch": 0.2353109363831308, + "grad_norm": 0.5809492468833923, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0434, + "step": 8230 + }, + { + "epoch": 0.23559685489635454, + "grad_norm": 0.8418740034103394, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0461, + "step": 8240 + }, + { + "epoch": 0.23588277340957828, + "grad_norm": 0.5811617374420166, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0443, + "step": 8250 + }, + { + "epoch": 0.236168691922802, + "grad_norm": 0.7699318528175354, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0549, + "step": 8260 + }, + { + "epoch": 0.23645461043602573, + "grad_norm": 0.6066992878913879, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0415, + "step": 8270 + }, + { + "epoch": 0.23674052894924946, + "grad_norm": 0.7775973677635193, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0619, + "step": 8280 + }, + { + "epoch": 0.2370264474624732, + "grad_norm": 0.8320962190628052, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.048, + "step": 8290 + }, + { + "epoch": 0.23731236597569694, + "grad_norm": 0.7203818559646606, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0594, + "step": 8300 + }, + { + "epoch": 0.23759828448892065, + "grad_norm": 0.7634598612785339, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0614, + "step": 8310 + }, + { + "epoch": 0.23788420300214438, + "grad_norm": 0.557575523853302, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 0.23817012151536812, + "grad_norm": 1.0139968395233154, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0457, + "step": 8330 + }, + { + "epoch": 0.23845604002859186, + "grad_norm": 0.5543113946914673, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.048, + "step": 8340 + }, + { + "epoch": 0.23874195854181557, + "grad_norm": 1.0122590065002441, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0509, + "step": 8350 + }, + { + "epoch": 0.2390278770550393, + "grad_norm": 0.8776134252548218, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0475, + "step": 8360 + }, + { + "epoch": 0.23931379556826304, + "grad_norm": 0.41230106353759766, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0467, + "step": 8370 + }, + { + "epoch": 0.23959971408148678, + "grad_norm": 0.5460986495018005, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0455, + "step": 8380 + }, + { + "epoch": 0.23988563259471052, + "grad_norm": 0.5896333456039429, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.051, + "step": 8390 + }, + { + "epoch": 0.24017155110793423, + "grad_norm": 0.536375105381012, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0432, + "step": 8400 + }, + { + "epoch": 0.24045746962115797, + "grad_norm": 0.7597050666809082, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0459, + "step": 8410 + }, + { + "epoch": 0.2407433881343817, + "grad_norm": 0.6669795513153076, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0584, + "step": 8420 + }, + { + "epoch": 0.24102930664760544, + "grad_norm": 0.3614502251148224, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.039, + "step": 8430 + }, + { + "epoch": 0.24131522516082915, + "grad_norm": 0.5618023872375488, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0394, + "step": 8440 + }, + { + "epoch": 0.2416011436740529, + "grad_norm": 0.5897185802459717, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0502, + "step": 8450 + }, + { + "epoch": 0.24188706218727662, + "grad_norm": 0.5622876882553101, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0382, + "step": 8460 + }, + { + "epoch": 0.24217298070050036, + "grad_norm": 0.5639696717262268, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0652, + "step": 8470 + }, + { + "epoch": 0.2424588992137241, + "grad_norm": 0.5686836242675781, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0609, + "step": 8480 + }, + { + "epoch": 0.2427448177269478, + "grad_norm": 0.7248222827911377, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0512, + "step": 8490 + }, + { + "epoch": 0.24303073624017155, + "grad_norm": 0.6157225370407104, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0449, + "step": 8500 + }, + { + "epoch": 0.24331665475339528, + "grad_norm": 1.1660966873168945, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0427, + "step": 8510 + }, + { + "epoch": 0.24360257326661902, + "grad_norm": 1.1242589950561523, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0514, + "step": 8520 + }, + { + "epoch": 0.24388849177984273, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0491, + "step": 8530 + }, + { + "epoch": 0.24417441029306647, + "grad_norm": 0.41474589705467224, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0427, + "step": 8540 + }, + { + "epoch": 0.2444603288062902, + "grad_norm": 0.42195969820022583, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0486, + "step": 8550 + }, + { + "epoch": 0.24474624731951394, + "grad_norm": 0.3914433717727661, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0411, + "step": 8560 + }, + { + "epoch": 0.24503216583273768, + "grad_norm": 0.7590876817703247, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0439, + "step": 8570 + }, + { + "epoch": 0.2453180843459614, + "grad_norm": 0.4362296164035797, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0466, + "step": 8580 + }, + { + "epoch": 0.24560400285918513, + "grad_norm": 0.467949241399765, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0502, + "step": 8590 + }, + { + "epoch": 0.24588992137240887, + "grad_norm": 0.4731729328632355, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0599, + "step": 8600 + }, + { + "epoch": 0.2461758398856326, + "grad_norm": 0.491644948720932, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0524, + "step": 8610 + }, + { + "epoch": 0.2464617583988563, + "grad_norm": 0.5254928469657898, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0469, + "step": 8620 + }, + { + "epoch": 0.24674767691208005, + "grad_norm": 0.5721238255500793, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0493, + "step": 8630 + }, + { + "epoch": 0.2470335954253038, + "grad_norm": 0.5806096792221069, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0391, + "step": 8640 + }, + { + "epoch": 0.24731951393852752, + "grad_norm": 0.6683222055435181, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.24760543245175126, + "grad_norm": 0.41728726029396057, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0411, + "step": 8660 + }, + { + "epoch": 0.24789135096497497, + "grad_norm": 0.6001113653182983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0413, + "step": 8670 + }, + { + "epoch": 0.2481772694781987, + "grad_norm": 0.43813610076904297, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0389, + "step": 8680 + }, + { + "epoch": 0.24846318799142245, + "grad_norm": 1.5533791780471802, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0597, + "step": 8690 + }, + { + "epoch": 0.24874910650464618, + "grad_norm": 1.175837755203247, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 0.2490350250178699, + "grad_norm": 0.4798300862312317, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0459, + "step": 8710 + }, + { + "epoch": 0.24932094353109363, + "grad_norm": 0.7334772944450378, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0432, + "step": 8720 + }, + { + "epoch": 0.24960686204431737, + "grad_norm": 0.9633310437202454, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.05, + "step": 8730 + }, + { + "epoch": 0.2498927805575411, + "grad_norm": 0.7353480458259583, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.25017869907076484, + "grad_norm": 0.5958748459815979, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0428, + "step": 8750 + }, + { + "epoch": 0.2504646175839886, + "grad_norm": 0.8538689613342285, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0498, + "step": 8760 + }, + { + "epoch": 0.2507505360972123, + "grad_norm": 0.606607973575592, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.251036454610436, + "grad_norm": 0.3999035060405731, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0714, + "step": 8780 + }, + { + "epoch": 0.25132237312365974, + "grad_norm": 0.807314932346344, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.057, + "step": 8790 + }, + { + "epoch": 0.2516082916368835, + "grad_norm": 0.5238217115402222, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0485, + "step": 8800 + }, + { + "epoch": 0.2518942101501072, + "grad_norm": 1.6465950012207031, + "learning_rate": 1.696714953556411e-05, + "loss": 0.056, + "step": 8810 + }, + { + "epoch": 0.25218012866333095, + "grad_norm": 0.6568214297294617, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0424, + "step": 8820 + }, + { + "epoch": 0.2524660471765547, + "grad_norm": 0.4695168137550354, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0456, + "step": 8830 + }, + { + "epoch": 0.2527519656897784, + "grad_norm": 0.5652263164520264, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0527, + "step": 8840 + }, + { + "epoch": 0.25303788420300216, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0441, + "step": 8850 + }, + { + "epoch": 0.2533238027162259, + "grad_norm": 0.8288971781730652, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0513, + "step": 8860 + }, + { + "epoch": 0.2536097212294496, + "grad_norm": 0.8606051802635193, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0416, + "step": 8870 + }, + { + "epoch": 0.2538956397426733, + "grad_norm": 0.7235842347145081, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0481, + "step": 8880 + }, + { + "epoch": 0.25418155825589706, + "grad_norm": 0.9602673053741455, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0465, + "step": 8890 + }, + { + "epoch": 0.2544674767691208, + "grad_norm": 0.6431217789649963, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0489, + "step": 8900 + }, + { + "epoch": 0.25475339528234453, + "grad_norm": 0.42215701937675476, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0376, + "step": 8910 + }, + { + "epoch": 0.25503931379556827, + "grad_norm": 0.5899976491928101, + "learning_rate": 1.688644181174108e-05, + "loss": 0.048, + "step": 8920 + }, + { + "epoch": 0.255325232308792, + "grad_norm": 0.9504411816596985, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.047, + "step": 8930 + }, + { + "epoch": 0.25561115082201574, + "grad_norm": 0.5808438062667847, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0535, + "step": 8940 + }, + { + "epoch": 0.2558970693352395, + "grad_norm": 0.3811270594596863, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0418, + "step": 8950 + }, + { + "epoch": 0.25618298784846316, + "grad_norm": 1.0257363319396973, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0548, + "step": 8960 + }, + { + "epoch": 0.2564689063616869, + "grad_norm": 0.7294469475746155, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0569, + "step": 8970 + }, + { + "epoch": 0.25675482487491064, + "grad_norm": 0.4967000484466553, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0488, + "step": 8980 + }, + { + "epoch": 0.2570407433881344, + "grad_norm": 0.9160422086715698, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0471, + "step": 8990 + }, + { + "epoch": 0.2573266619013581, + "grad_norm": 0.5125435590744019, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.25761258041458185, + "grad_norm": 0.5617201328277588, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0597, + "step": 9010 + }, + { + "epoch": 0.2578984989278056, + "grad_norm": 0.7771851420402527, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0485, + "step": 9020 + }, + { + "epoch": 0.2581844174410293, + "grad_norm": 0.8434289693832397, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0429, + "step": 9030 + }, + { + "epoch": 0.25847033595425306, + "grad_norm": 0.513541042804718, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0488, + "step": 9040 + }, + { + "epoch": 0.25875625446747674, + "grad_norm": 1.0142096281051636, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0517, + "step": 9050 + }, + { + "epoch": 0.2590421729807005, + "grad_norm": 0.6343669295310974, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.049, + "step": 9060 + }, + { + "epoch": 0.2593280914939242, + "grad_norm": 0.33996936678886414, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.037, + "step": 9070 + }, + { + "epoch": 0.25961401000714796, + "grad_norm": 0.5964446663856506, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 0.2598999285203717, + "grad_norm": 0.4989728629589081, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0463, + "step": 9090 + }, + { + "epoch": 0.26018584703359543, + "grad_norm": 0.7735986113548279, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0576, + "step": 9100 + }, + { + "epoch": 0.26047176554681917, + "grad_norm": 1.2520418167114258, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0577, + "step": 9110 + }, + { + "epoch": 0.2607576840600429, + "grad_norm": 0.45247936248779297, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0458, + "step": 9120 + }, + { + "epoch": 0.26104360257326664, + "grad_norm": 0.8944823145866394, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0498, + "step": 9130 + }, + { + "epoch": 0.2613295210864903, + "grad_norm": 0.8308315277099609, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0545, + "step": 9140 + }, + { + "epoch": 0.26161543959971406, + "grad_norm": 0.6838778853416443, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 0.2619013581129378, + "grad_norm": 1.5998408794403076, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0591, + "step": 9160 + }, + { + "epoch": 0.26218727662616154, + "grad_norm": 0.8548596501350403, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.04, + "step": 9170 + }, + { + "epoch": 0.2624731951393853, + "grad_norm": 0.5784913897514343, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0464, + "step": 9180 + }, + { + "epoch": 0.262759113652609, + "grad_norm": 1.490502953529358, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0672, + "step": 9190 + }, + { + "epoch": 0.26304503216583275, + "grad_norm": 0.8950793743133545, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0532, + "step": 9200 + }, + { + "epoch": 0.2633309506790565, + "grad_norm": 0.5513611435890198, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 0.2636168691922802, + "grad_norm": 1.0512864589691162, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0487, + "step": 9220 + }, + { + "epoch": 0.2639027877055039, + "grad_norm": 0.48180028796195984, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0543, + "step": 9230 + }, + { + "epoch": 0.26418870621872764, + "grad_norm": 0.5451590418815613, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0553, + "step": 9240 + }, + { + "epoch": 0.2644746247319514, + "grad_norm": 0.6986148953437805, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0523, + "step": 9250 + }, + { + "epoch": 0.2647605432451751, + "grad_norm": 0.5977929830551147, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0446, + "step": 9260 + }, + { + "epoch": 0.26504646175839885, + "grad_norm": 0.6042361855506897, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0716, + "step": 9270 + }, + { + "epoch": 0.2653323802716226, + "grad_norm": 0.473418265581131, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0378, + "step": 9280 + }, + { + "epoch": 0.26561829878484633, + "grad_norm": 0.9332809448242188, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0484, + "step": 9290 + }, + { + "epoch": 0.26590421729807007, + "grad_norm": 0.5209246277809143, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0439, + "step": 9300 + }, + { + "epoch": 0.2661901358112938, + "grad_norm": 0.5742560625076294, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0468, + "step": 9310 + }, + { + "epoch": 0.2664760543245175, + "grad_norm": 0.585503876209259, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0507, + "step": 9320 + }, + { + "epoch": 0.2667619728377412, + "grad_norm": 0.5254957675933838, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0436, + "step": 9330 + }, + { + "epoch": 0.26704789135096496, + "grad_norm": 0.48314452171325684, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0417, + "step": 9340 + }, + { + "epoch": 0.2673338098641887, + "grad_norm": 0.630020022392273, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0425, + "step": 9350 + }, + { + "epoch": 0.26761972837741244, + "grad_norm": 0.3545299470424652, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0338, + "step": 9360 + }, + { + "epoch": 0.2679056468906362, + "grad_norm": 0.6934211850166321, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0445, + "step": 9370 + }, + { + "epoch": 0.2681915654038599, + "grad_norm": 0.6544952392578125, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0451, + "step": 9380 + }, + { + "epoch": 0.26847748391708365, + "grad_norm": 0.4581946134567261, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0422, + "step": 9390 + }, + { + "epoch": 0.2687634024303074, + "grad_norm": 0.6338506937026978, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0576, + "step": 9400 + }, + { + "epoch": 0.26904932094353107, + "grad_norm": 0.8165014386177063, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0474, + "step": 9410 + }, + { + "epoch": 0.2693352394567548, + "grad_norm": 0.793222188949585, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0546, + "step": 9420 + }, + { + "epoch": 0.26962115796997854, + "grad_norm": 0.3669852316379547, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0461, + "step": 9430 + }, + { + "epoch": 0.2699070764832023, + "grad_norm": 0.7339810729026794, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0433, + "step": 9440 + }, + { + "epoch": 0.270192994996426, + "grad_norm": 0.4948982298374176, + "learning_rate": 1.648606940465527e-05, + "loss": 0.048, + "step": 9450 + }, + { + "epoch": 0.27047891350964975, + "grad_norm": 0.4681016206741333, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0437, + "step": 9460 + }, + { + "epoch": 0.2707648320228735, + "grad_norm": 0.5091472864151001, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0576, + "step": 9470 + }, + { + "epoch": 0.27105075053609723, + "grad_norm": 0.5683515071868896, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0503, + "step": 9480 + }, + { + "epoch": 0.27133666904932097, + "grad_norm": 0.626844048500061, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0495, + "step": 9490 + }, + { + "epoch": 0.27162258756254465, + "grad_norm": 0.6757943034172058, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0495, + "step": 9500 + }, + { + "epoch": 0.2719085060757684, + "grad_norm": 0.7049196362495422, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0579, + "step": 9510 + }, + { + "epoch": 0.2721944245889921, + "grad_norm": 0.6469181776046753, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.051, + "step": 9520 + }, + { + "epoch": 0.27248034310221586, + "grad_norm": 0.5414942502975464, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0433, + "step": 9530 + }, + { + "epoch": 0.2727662616154396, + "grad_norm": 0.5642798542976379, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0495, + "step": 9540 + }, + { + "epoch": 0.27305218012866334, + "grad_norm": 1.0527595281600952, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0445, + "step": 9550 + }, + { + "epoch": 0.2733380986418871, + "grad_norm": 0.8501784801483154, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0627, + "step": 9560 + }, + { + "epoch": 0.2736240171551108, + "grad_norm": 0.7892033457756042, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 0.27390993566833455, + "grad_norm": 0.3588624596595764, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0512, + "step": 9580 + }, + { + "epoch": 0.27419585418155823, + "grad_norm": 0.7474772930145264, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.6217718124389648, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0435, + "step": 9600 + }, + { + "epoch": 0.2747676912080057, + "grad_norm": 0.7711623907089233, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.054, + "step": 9610 + }, + { + "epoch": 0.27505360972122944, + "grad_norm": 0.8171371221542358, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0371, + "step": 9620 + }, + { + "epoch": 0.2753395282344532, + "grad_norm": 0.8668338060379028, + "learning_rate": 1.634591312387623e-05, + "loss": 0.055, + "step": 9630 + }, + { + "epoch": 0.2756254467476769, + "grad_norm": 0.5683940052986145, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0478, + "step": 9640 + }, + { + "epoch": 0.27591136526090065, + "grad_norm": 0.44098007678985596, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0531, + "step": 9650 + }, + { + "epoch": 0.2761972837741244, + "grad_norm": 0.8305087685585022, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0462, + "step": 9660 + }, + { + "epoch": 0.27648320228734813, + "grad_norm": 0.9088799953460693, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0489, + "step": 9670 + }, + { + "epoch": 0.2767691208005718, + "grad_norm": 0.5590132474899292, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0486, + "step": 9680 + }, + { + "epoch": 0.27705503931379555, + "grad_norm": 0.776713490486145, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0443, + "step": 9690 + }, + { + "epoch": 0.2773409578270193, + "grad_norm": 0.6107578873634338, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0461, + "step": 9700 + }, + { + "epoch": 0.277626876340243, + "grad_norm": 0.4635901153087616, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0397, + "step": 9710 + }, + { + "epoch": 0.27791279485346676, + "grad_norm": 0.4220955967903137, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0463, + "step": 9720 + }, + { + "epoch": 0.2781987133666905, + "grad_norm": 0.4947739243507385, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0397, + "step": 9730 + }, + { + "epoch": 0.27848463187991424, + "grad_norm": 0.5589033961296082, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0426, + "step": 9740 + }, + { + "epoch": 0.278770550393138, + "grad_norm": 0.4904254972934723, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0458, + "step": 9750 + }, + { + "epoch": 0.2790564689063617, + "grad_norm": 0.34956127405166626, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.2793423874195854, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0424, + "step": 9770 + }, + { + "epoch": 0.27962830593280913, + "grad_norm": 0.48727869987487793, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0451, + "step": 9780 + }, + { + "epoch": 0.27991422444603287, + "grad_norm": 0.7314761281013489, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0523, + "step": 9790 + }, + { + "epoch": 0.2802001429592566, + "grad_norm": 0.5017405152320862, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0423, + "step": 9800 + }, + { + "epoch": 0.28048606147248034, + "grad_norm": 0.8375383615493774, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0435, + "step": 9810 + }, + { + "epoch": 0.2807719799857041, + "grad_norm": 0.8702818155288696, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0487, + "step": 9820 + }, + { + "epoch": 0.2810578984989278, + "grad_norm": 0.4649866223335266, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0483, + "step": 9830 + }, + { + "epoch": 0.28134381701215155, + "grad_norm": 0.7464607357978821, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0463, + "step": 9840 + }, + { + "epoch": 0.2816297355253753, + "grad_norm": 0.48055607080459595, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0418, + "step": 9850 + }, + { + "epoch": 0.281915654038599, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0433, + "step": 9860 + }, + { + "epoch": 0.2822015725518227, + "grad_norm": 0.8859265446662903, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0605, + "step": 9870 + }, + { + "epoch": 0.28248749106504645, + "grad_norm": 0.8236640691757202, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0441, + "step": 9880 + }, + { + "epoch": 0.2827734095782702, + "grad_norm": 0.6617199778556824, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0515, + "step": 9890 + }, + { + "epoch": 0.2830593280914939, + "grad_norm": 0.8017821907997131, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0503, + "step": 9900 + }, + { + "epoch": 0.28334524660471766, + "grad_norm": 1.070827603340149, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0485, + "step": 9910 + }, + { + "epoch": 0.2836311651179414, + "grad_norm": 1.021888256072998, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0479, + "step": 9920 + }, + { + "epoch": 0.28391708363116513, + "grad_norm": 0.34402501583099365, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0391, + "step": 9930 + }, + { + "epoch": 0.28420300214438887, + "grad_norm": 0.58541339635849, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0461, + "step": 9940 + }, + { + "epoch": 0.28448892065761255, + "grad_norm": 0.8062207102775574, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0553, + "step": 9950 + }, + { + "epoch": 0.2847748391708363, + "grad_norm": 0.6435661315917969, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0536, + "step": 9960 + }, + { + "epoch": 0.28506075768406003, + "grad_norm": 0.5670832395553589, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0405, + "step": 9970 + }, + { + "epoch": 0.28534667619728377, + "grad_norm": 0.45282548666000366, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0458, + "step": 9980 + }, + { + "epoch": 0.2856325947105075, + "grad_norm": 0.42272916436195374, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0392, + "step": 9990 + }, + { + "epoch": 0.28591851322373124, + "grad_norm": 0.5791928768157959, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0453, + "step": 10000 + }, + { + "epoch": 0.286204431736955, + "grad_norm": 0.9841408729553223, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.052, + "step": 10010 + }, + { + "epoch": 0.2864903502501787, + "grad_norm": 0.8658338785171509, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0461, + "step": 10020 + }, + { + "epoch": 0.28677626876340245, + "grad_norm": 0.624788224697113, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0416, + "step": 10030 + }, + { + "epoch": 0.28706218727662614, + "grad_norm": 0.6108028888702393, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0548, + "step": 10040 + }, + { + "epoch": 0.2873481057898499, + "grad_norm": 0.7907708883285522, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0406, + "step": 10050 + }, + { + "epoch": 0.2876340243030736, + "grad_norm": 0.7695413827896118, + "learning_rate": 1.60029690609047e-05, + "loss": 0.061, + "step": 10060 + }, + { + "epoch": 0.28791994281629735, + "grad_norm": 0.4407683312892914, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0483, + "step": 10070 + }, + { + "epoch": 0.2882058613295211, + "grad_norm": 0.6242743730545044, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.039, + "step": 10080 + }, + { + "epoch": 0.2884917798427448, + "grad_norm": 0.8752113580703735, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0433, + "step": 10090 + }, + { + "epoch": 0.28877769835596856, + "grad_norm": 0.8834511041641235, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0348, + "step": 10100 + }, + { + "epoch": 0.2890636168691923, + "grad_norm": 1.0036063194274902, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0593, + "step": 10110 + }, + { + "epoch": 0.28934953538241603, + "grad_norm": 0.5511205196380615, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0459, + "step": 10120 + }, + { + "epoch": 0.2896354538956397, + "grad_norm": 0.7717337012290955, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0381, + "step": 10130 + }, + { + "epoch": 0.28992137240886345, + "grad_norm": 1.123363971710205, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0462, + "step": 10140 + }, + { + "epoch": 0.2902072909220872, + "grad_norm": 0.6212007403373718, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0446, + "step": 10150 + }, + { + "epoch": 0.29049320943531093, + "grad_norm": 0.5547964572906494, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0362, + "step": 10160 + }, + { + "epoch": 0.29077912794853467, + "grad_norm": 0.593225359916687, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0462, + "step": 10170 + }, + { + "epoch": 0.2910650464617584, + "grad_norm": 0.5569560527801514, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0508, + "step": 10180 + }, + { + "epoch": 0.29135096497498214, + "grad_norm": 0.5464656949043274, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0399, + "step": 10190 + }, + { + "epoch": 0.2916368834882059, + "grad_norm": 1.2456778287887573, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0494, + "step": 10200 + }, + { + "epoch": 0.2919228020014296, + "grad_norm": 0.7862445712089539, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0551, + "step": 10210 + }, + { + "epoch": 0.2922087205146533, + "grad_norm": 0.745941698551178, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0469, + "step": 10220 + }, + { + "epoch": 0.29249463902787703, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0492, + "step": 10230 + }, + { + "epoch": 0.29278055754110077, + "grad_norm": 0.659205973148346, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0453, + "step": 10240 + }, + { + "epoch": 0.2930664760543245, + "grad_norm": 0.6925905346870422, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0463, + "step": 10250 + }, + { + "epoch": 0.29335239456754825, + "grad_norm": 0.479115754365921, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0395, + "step": 10260 + }, + { + "epoch": 0.293638313080772, + "grad_norm": 0.5085121393203735, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0504, + "step": 10270 + }, + { + "epoch": 0.2939242315939957, + "grad_norm": 0.46833914518356323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0411, + "step": 10280 + }, + { + "epoch": 0.29421015010721946, + "grad_norm": 0.4534672796726227, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0491, + "step": 10290 + }, + { + "epoch": 0.2944960686204432, + "grad_norm": 0.5704737305641174, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0391, + "step": 10300 + }, + { + "epoch": 0.2947819871336669, + "grad_norm": 1.0342676639556885, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0681, + "step": 10310 + }, + { + "epoch": 0.2950679056468906, + "grad_norm": 0.5002169013023376, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0429, + "step": 10320 + }, + { + "epoch": 0.29535382416011435, + "grad_norm": 0.5565863847732544, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0575, + "step": 10330 + }, + { + "epoch": 0.2956397426733381, + "grad_norm": 0.7826551198959351, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0448, + "step": 10340 + }, + { + "epoch": 0.29592566118656183, + "grad_norm": 0.7019012570381165, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0436, + "step": 10350 + }, + { + "epoch": 0.29621157969978557, + "grad_norm": 0.8324534893035889, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0503, + "step": 10360 + }, + { + "epoch": 0.2964974982130093, + "grad_norm": 0.7064073085784912, + "learning_rate": 1.574895332125391e-05, + "loss": 0.041, + "step": 10370 + }, + { + "epoch": 0.29678341672623304, + "grad_norm": 0.5634047389030457, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0474, + "step": 10380 + }, + { + "epoch": 0.2970693352394568, + "grad_norm": 0.8504926562309265, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0502, + "step": 10390 + }, + { + "epoch": 0.29735525375268046, + "grad_norm": 0.508313775062561, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0368, + "step": 10400 + }, + { + "epoch": 0.2976411722659042, + "grad_norm": 0.5851112008094788, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0472, + "step": 10410 + }, + { + "epoch": 0.29792709077912793, + "grad_norm": 0.5689557790756226, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0479, + "step": 10420 + }, + { + "epoch": 0.29821300929235167, + "grad_norm": 0.5026743412017822, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0406, + "step": 10430 + }, + { + "epoch": 0.2984989278055754, + "grad_norm": 0.5662751197814941, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0441, + "step": 10440 + }, + { + "epoch": 0.29878484631879915, + "grad_norm": 0.899709939956665, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.2990707648320229, + "grad_norm": 0.4681940972805023, + "learning_rate": 1.567419089313346e-05, + "loss": 0.054, + "step": 10460 + }, + { + "epoch": 0.2993566833452466, + "grad_norm": 0.39646071195602417, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0375, + "step": 10470 + }, + { + "epoch": 0.29964260185847036, + "grad_norm": 1.204815149307251, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0487, + "step": 10480 + }, + { + "epoch": 0.29992852037169404, + "grad_norm": 0.4507630467414856, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0516, + "step": 10490 + }, + { + "epoch": 0.3002144388849178, + "grad_norm": 0.9783321022987366, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0642, + "step": 10500 + }, + { + "epoch": 0.3005003573981415, + "grad_norm": 0.5406969785690308, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0447, + "step": 10510 + }, + { + "epoch": 0.30078627591136525, + "grad_norm": 0.44153860211372375, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0449, + "step": 10520 + }, + { + "epoch": 0.301072194424589, + "grad_norm": 0.5723687410354614, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0548, + "step": 10530 + }, + { + "epoch": 0.3013581129378127, + "grad_norm": 0.4453120529651642, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0434, + "step": 10540 + }, + { + "epoch": 0.30164403145103647, + "grad_norm": 0.34224697947502136, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0385, + "step": 10550 + }, + { + "epoch": 0.3019299499642602, + "grad_norm": 0.6389157176017761, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0569, + "step": 10560 + }, + { + "epoch": 0.30221586847748394, + "grad_norm": 0.5845953822135925, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0467, + "step": 10570 + }, + { + "epoch": 0.3025017869907076, + "grad_norm": 0.6581900119781494, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0422, + "step": 10580 + }, + { + "epoch": 0.30278770550393136, + "grad_norm": 0.4964161813259125, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0428, + "step": 10590 + }, + { + "epoch": 0.3030736240171551, + "grad_norm": 0.635380208492279, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0442, + "step": 10600 + }, + { + "epoch": 0.30335954253037883, + "grad_norm": 0.9795969128608704, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0517, + "step": 10610 + }, + { + "epoch": 0.30364546104360257, + "grad_norm": 0.9987231492996216, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0514, + "step": 10620 + }, + { + "epoch": 0.3039313795568263, + "grad_norm": 0.6384946703910828, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0471, + "step": 10630 + }, + { + "epoch": 0.30421729807005005, + "grad_norm": 0.49352115392684937, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0351, + "step": 10640 + }, + { + "epoch": 0.3045032165832738, + "grad_norm": 0.45028480887413025, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0438, + "step": 10650 + }, + { + "epoch": 0.3047891350964975, + "grad_norm": 0.5717794895172119, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0491, + "step": 10660 + }, + { + "epoch": 0.3050750536097212, + "grad_norm": 0.5436326265335083, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0407, + "step": 10670 + }, + { + "epoch": 0.30536097212294494, + "grad_norm": 0.7777692675590515, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0436, + "step": 10680 + }, + { + "epoch": 0.3056468906361687, + "grad_norm": 0.6597929000854492, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0407, + "step": 10690 + }, + { + "epoch": 0.3059328091493924, + "grad_norm": 0.6059311032295227, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0481, + "step": 10700 + }, + { + "epoch": 0.30621872766261615, + "grad_norm": 0.5530681014060974, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0418, + "step": 10710 + }, + { + "epoch": 0.3065046461758399, + "grad_norm": 0.5778716802597046, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0429, + "step": 10720 + }, + { + "epoch": 0.3067905646890636, + "grad_norm": 0.4573792517185211, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0586, + "step": 10730 + }, + { + "epoch": 0.30707648320228736, + "grad_norm": 0.8193615078926086, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0474, + "step": 10740 + }, + { + "epoch": 0.3073624017155111, + "grad_norm": 0.9410123229026794, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.3076483202287348, + "grad_norm": 0.8244432806968689, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0462, + "step": 10760 + }, + { + "epoch": 0.3079342387419585, + "grad_norm": 0.644899845123291, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0479, + "step": 10770 + }, + { + "epoch": 0.30822015725518226, + "grad_norm": 0.28044867515563965, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.04, + "step": 10780 + }, + { + "epoch": 0.308506075768406, + "grad_norm": 0.6538394093513489, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0406, + "step": 10790 + }, + { + "epoch": 0.30879199428162973, + "grad_norm": 0.9572822451591492, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0505, + "step": 10800 + }, + { + "epoch": 0.30907791279485347, + "grad_norm": 0.539826512336731, + "learning_rate": 1.537928999540189e-05, + "loss": 0.05, + "step": 10810 + }, + { + "epoch": 0.3093638313080772, + "grad_norm": 0.801988959312439, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0454, + "step": 10820 + }, + { + "epoch": 0.30964974982130095, + "grad_norm": 0.57478928565979, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.039, + "step": 10830 + }, + { + "epoch": 0.3099356683345247, + "grad_norm": 0.6313017010688782, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0384, + "step": 10840 + }, + { + "epoch": 0.31022158684774837, + "grad_norm": 0.507997989654541, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0365, + "step": 10850 + }, + { + "epoch": 0.3105075053609721, + "grad_norm": 0.5152313709259033, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0487, + "step": 10860 + }, + { + "epoch": 0.31079342387419584, + "grad_norm": 0.6123478412628174, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0405, + "step": 10870 + }, + { + "epoch": 0.3110793423874196, + "grad_norm": 1.079551100730896, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0443, + "step": 10880 + }, + { + "epoch": 0.3113652609006433, + "grad_norm": 0.39866960048675537, + "learning_rate": 1.531098472380285e-05, + "loss": 0.04, + "step": 10890 + }, + { + "epoch": 0.31165117941386705, + "grad_norm": 0.3715427815914154, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0387, + "step": 10900 + }, + { + "epoch": 0.3119370979270908, + "grad_norm": 0.7201068997383118, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.054, + "step": 10910 + }, + { + "epoch": 0.3122230164403145, + "grad_norm": 0.9512631893157959, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0383, + "step": 10920 + }, + { + "epoch": 0.31250893495353826, + "grad_norm": 0.5948206186294556, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0472, + "step": 10930 + }, + { + "epoch": 0.31279485346676195, + "grad_norm": 0.7174249291419983, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0437, + "step": 10940 + }, + { + "epoch": 0.3130807719799857, + "grad_norm": 0.6190982460975647, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0383, + "step": 10950 + }, + { + "epoch": 0.3133666904932094, + "grad_norm": 0.7733815312385559, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0327, + "step": 10960 + }, + { + "epoch": 0.31365260900643316, + "grad_norm": 1.2995271682739258, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0427, + "step": 10970 + }, + { + "epoch": 0.3139385275196569, + "grad_norm": 1.1102336645126343, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.04, + "step": 10980 + }, + { + "epoch": 0.31422444603288063, + "grad_norm": 0.7618277668952942, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0488, + "step": 10990 + }, + { + "epoch": 0.31451036454610437, + "grad_norm": 0.5355142951011658, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0436, + "step": 11000 + }, + { + "epoch": 0.3147962830593281, + "grad_norm": 1.3410072326660156, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0463, + "step": 11010 + }, + { + "epoch": 0.31508220157255185, + "grad_norm": 0.7810450196266174, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0493, + "step": 11020 + }, + { + "epoch": 0.3153681200857755, + "grad_norm": 0.6452206373214722, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0354, + "step": 11030 + }, + { + "epoch": 0.31565403859899926, + "grad_norm": 1.037593126296997, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0418, + "step": 11040 + }, + { + "epoch": 0.315939957112223, + "grad_norm": 0.7032834887504578, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0431, + "step": 11050 + }, + { + "epoch": 0.31622587562544674, + "grad_norm": 0.5168939232826233, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0472, + "step": 11060 + }, + { + "epoch": 0.3165117941386705, + "grad_norm": 0.5239925384521484, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0438, + "step": 11070 + }, + { + "epoch": 0.3167977126518942, + "grad_norm": 0.8209654688835144, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0506, + "step": 11080 + }, + { + "epoch": 0.31708363116511795, + "grad_norm": 0.5318232178688049, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0516, + "step": 11090 + }, + { + "epoch": 0.3173695496783417, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0482, + "step": 11100 + }, + { + "epoch": 0.3176554681915654, + "grad_norm": 0.6691215634346008, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.046, + "step": 11110 + }, + { + "epoch": 0.3179413867047891, + "grad_norm": 0.4862753450870514, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 0.31822730521801285, + "grad_norm": 0.4640316963195801, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0433, + "step": 11130 + }, + { + "epoch": 0.3185132237312366, + "grad_norm": 0.7841521501541138, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0445, + "step": 11140 + }, + { + "epoch": 0.3187991422444603, + "grad_norm": 0.6809426546096802, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0518, + "step": 11150 + }, + { + "epoch": 0.31908506075768406, + "grad_norm": 0.6195946931838989, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0569, + "step": 11160 + }, + { + "epoch": 0.3193709792709078, + "grad_norm": 0.7289860248565674, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0487, + "step": 11170 + }, + { + "epoch": 0.31965689778413153, + "grad_norm": 0.5575736165046692, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0409, + "step": 11180 + }, + { + "epoch": 0.31994281629735527, + "grad_norm": 0.8619267344474792, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0424, + "step": 11190 + }, + { + "epoch": 0.320228734810579, + "grad_norm": 0.740242063999176, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0474, + "step": 11200 + }, + { + "epoch": 0.3205146533238027, + "grad_norm": 0.4169894754886627, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0395, + "step": 11210 + }, + { + "epoch": 0.3208005718370264, + "grad_norm": 0.5773794651031494, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0414, + "step": 11220 + }, + { + "epoch": 0.32108649035025016, + "grad_norm": 0.4941500723361969, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0484, + "step": 11230 + }, + { + "epoch": 0.3213724088634739, + "grad_norm": 0.7985579371452332, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.051, + "step": 11240 + }, + { + "epoch": 0.32165832737669764, + "grad_norm": 0.5262066721916199, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0434, + "step": 11250 + }, + { + "epoch": 0.3219442458899214, + "grad_norm": 0.4074312150478363, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0428, + "step": 11260 + }, + { + "epoch": 0.3222301644031451, + "grad_norm": 1.0757715702056885, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0468, + "step": 11270 + }, + { + "epoch": 0.32251608291636885, + "grad_norm": 0.7281575202941895, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0386, + "step": 11280 + }, + { + "epoch": 0.3228020014295926, + "grad_norm": 0.35078516602516174, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0413, + "step": 11290 + }, + { + "epoch": 0.32308791994281627, + "grad_norm": 0.5642452836036682, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0435, + "step": 11300 + }, + { + "epoch": 0.32337383845604, + "grad_norm": 0.5326974987983704, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0459, + "step": 11310 + }, + { + "epoch": 0.32365975696926375, + "grad_norm": 0.6212049126625061, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0451, + "step": 11320 + }, + { + "epoch": 0.3239456754824875, + "grad_norm": 0.4887222349643707, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0445, + "step": 11330 + }, + { + "epoch": 0.3242315939957112, + "grad_norm": 0.6692403554916382, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0423, + "step": 11340 + }, + { + "epoch": 0.32451751250893496, + "grad_norm": 0.7166061997413635, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0445, + "step": 11350 + }, + { + "epoch": 0.3248034310221587, + "grad_norm": 0.5342463850975037, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0394, + "step": 11360 + }, + { + "epoch": 0.32508934953538243, + "grad_norm": 1.0617904663085938, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0401, + "step": 11370 + }, + { + "epoch": 0.32537526804860617, + "grad_norm": 0.9869458675384521, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0508, + "step": 11380 + }, + { + "epoch": 0.32566118656182985, + "grad_norm": 0.32021698355674744, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0346, + "step": 11390 + }, + { + "epoch": 0.3259471050750536, + "grad_norm": 0.6566154360771179, + "learning_rate": 1.486814531655139e-05, + "loss": 0.046, + "step": 11400 + }, + { + "epoch": 0.3262330235882773, + "grad_norm": 0.6716777086257935, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.045, + "step": 11410 + }, + { + "epoch": 0.32651894210150106, + "grad_norm": 0.7489042282104492, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0443, + "step": 11420 + }, + { + "epoch": 0.3268048606147248, + "grad_norm": 0.6040313243865967, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0418, + "step": 11430 + }, + { + "epoch": 0.32709077912794854, + "grad_norm": 0.4891999363899231, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0342, + "step": 11440 + }, + { + "epoch": 0.3273766976411723, + "grad_norm": 0.4264339506626129, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0414, + "step": 11450 + }, + { + "epoch": 0.327662616154396, + "grad_norm": 0.5535606741905212, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0362, + "step": 11460 + }, + { + "epoch": 0.32794853466761975, + "grad_norm": 0.566705048084259, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0472, + "step": 11470 + }, + { + "epoch": 0.32823445318084343, + "grad_norm": 0.8539089560508728, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0478, + "step": 11480 + }, + { + "epoch": 0.32852037169406717, + "grad_norm": 0.3981179893016815, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0429, + "step": 11490 + }, + { + "epoch": 0.3288062902072909, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0487, + "step": 11500 + }, + { + "epoch": 0.32909220872051465, + "grad_norm": 0.45551198720932007, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0384, + "step": 11510 + }, + { + "epoch": 0.3293781272337384, + "grad_norm": 0.6321517825126648, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0541, + "step": 11520 + }, + { + "epoch": 0.3296640457469621, + "grad_norm": 0.7971932888031006, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0445, + "step": 11530 + }, + { + "epoch": 0.32994996426018586, + "grad_norm": 0.5022657513618469, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0414, + "step": 11540 + }, + { + "epoch": 0.3302358827734096, + "grad_norm": 0.7302954196929932, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.044, + "step": 11550 + }, + { + "epoch": 0.33052180128663333, + "grad_norm": 0.5123834013938904, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0451, + "step": 11560 + }, + { + "epoch": 0.330807719799857, + "grad_norm": 0.5261625647544861, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0416, + "step": 11570 + }, + { + "epoch": 0.33109363831308075, + "grad_norm": 0.5782840251922607, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0419, + "step": 11580 + }, + { + "epoch": 0.3313795568263045, + "grad_norm": 0.9754800796508789, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0403, + "step": 11590 + }, + { + "epoch": 0.3316654753395282, + "grad_norm": 0.48157551884651184, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0459, + "step": 11600 + }, + { + "epoch": 0.33195139385275196, + "grad_norm": 0.4394964277744293, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0461, + "step": 11610 + }, + { + "epoch": 0.3322373123659757, + "grad_norm": 1.220790147781372, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0448, + "step": 11620 + }, + { + "epoch": 0.33252323087919944, + "grad_norm": 0.6908231973648071, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0431, + "step": 11630 + }, + { + "epoch": 0.3328091493924232, + "grad_norm": 0.45382779836654663, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0379, + "step": 11640 + }, + { + "epoch": 0.3330950679056469, + "grad_norm": 0.5963619947433472, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0465, + "step": 11650 + }, + { + "epoch": 0.3333809864188706, + "grad_norm": 0.676210880279541, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0411, + "step": 11660 + }, + { + "epoch": 0.33366690493209433, + "grad_norm": 0.893473744392395, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0443, + "step": 11670 + }, + { + "epoch": 0.33395282344531807, + "grad_norm": 0.30655553936958313, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.04, + "step": 11680 + }, + { + "epoch": 0.3342387419585418, + "grad_norm": 0.899615466594696, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0462, + "step": 11690 + }, + { + "epoch": 0.33452466047176554, + "grad_norm": 0.5037568807601929, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0394, + "step": 11700 + }, + { + "epoch": 0.3348105789849893, + "grad_norm": 0.573716402053833, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0426, + "step": 11710 + }, + { + "epoch": 0.335096497498213, + "grad_norm": 0.4985221326351166, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0422, + "step": 11720 + }, + { + "epoch": 0.33538241601143676, + "grad_norm": 0.8864797353744507, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0504, + "step": 11730 + }, + { + "epoch": 0.3356683345246605, + "grad_norm": 0.49209004640579224, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0409, + "step": 11740 + }, + { + "epoch": 0.3359542530378842, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0468, + "step": 11750 + }, + { + "epoch": 0.3362401715511079, + "grad_norm": 0.7552497386932373, + "learning_rate": 1.454836451908656e-05, + "loss": 0.041, + "step": 11760 + }, + { + "epoch": 0.33652609006433165, + "grad_norm": 0.5737242102622986, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0503, + "step": 11770 + }, + { + "epoch": 0.3368120085775554, + "grad_norm": 0.46150341629981995, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0399, + "step": 11780 + }, + { + "epoch": 0.3370979270907791, + "grad_norm": 0.55389803647995, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0442, + "step": 11790 + }, + { + "epoch": 0.33738384560400286, + "grad_norm": 0.7647727727890015, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0472, + "step": 11800 + }, + { + "epoch": 0.3376697641172266, + "grad_norm": 0.8755397200584412, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0444, + "step": 11810 + }, + { + "epoch": 0.33795568263045034, + "grad_norm": 0.9257917404174805, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0416, + "step": 11820 + }, + { + "epoch": 0.3382416011436741, + "grad_norm": 0.4048840403556824, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0418, + "step": 11830 + }, + { + "epoch": 0.33852751965689776, + "grad_norm": 0.584200382232666, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0436, + "step": 11840 + }, + { + "epoch": 0.3388134381701215, + "grad_norm": 0.7565616369247437, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0407, + "step": 11850 + }, + { + "epoch": 0.33909935668334523, + "grad_norm": 0.8025793433189392, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0424, + "step": 11860 + }, + { + "epoch": 0.33938527519656897, + "grad_norm": 0.3123756945133209, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.044, + "step": 11870 + }, + { + "epoch": 0.3396711937097927, + "grad_norm": 0.8047941327095032, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0471, + "step": 11880 + }, + { + "epoch": 0.33995711222301644, + "grad_norm": 0.8675779104232788, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0443, + "step": 11890 + }, + { + "epoch": 0.3402430307362402, + "grad_norm": 0.47229406237602234, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0416, + "step": 11900 + }, + { + "epoch": 0.3405289492494639, + "grad_norm": 0.3775595426559448, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0512, + "step": 11910 + }, + { + "epoch": 0.34081486776268766, + "grad_norm": 0.6179372668266296, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0395, + "step": 11920 + }, + { + "epoch": 0.34110078627591134, + "grad_norm": 0.47618359327316284, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0407, + "step": 11930 + }, + { + "epoch": 0.3413867047891351, + "grad_norm": 0.5495609641075134, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.041, + "step": 11940 + }, + { + "epoch": 0.3416726233023588, + "grad_norm": 0.7276089191436768, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0445, + "step": 11950 + }, + { + "epoch": 0.34195854181558255, + "grad_norm": 0.9464111328125, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0471, + "step": 11960 + }, + { + "epoch": 0.3422444603288063, + "grad_norm": 0.8340250253677368, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0488, + "step": 11970 + }, + { + "epoch": 0.34253037884203, + "grad_norm": 0.6392719149589539, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0407, + "step": 11980 + }, + { + "epoch": 0.34281629735525376, + "grad_norm": 0.7563493251800537, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0388, + "step": 11990 + }, + { + "epoch": 0.3431022158684775, + "grad_norm": 0.7145271301269531, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.042, + "step": 12000 + }, + { + "epoch": 0.34338813438170124, + "grad_norm": 0.6522033214569092, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0507, + "step": 12010 + }, + { + "epoch": 0.3436740528949249, + "grad_norm": 0.4634755849838257, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0388, + "step": 12020 + }, + { + "epoch": 0.34395997140814866, + "grad_norm": 0.6681762337684631, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0498, + "step": 12030 + }, + { + "epoch": 0.3442458899213724, + "grad_norm": 0.5068351626396179, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0484, + "step": 12040 + }, + { + "epoch": 0.34453180843459613, + "grad_norm": 0.5424943566322327, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0406, + "step": 12050 + }, + { + "epoch": 0.34481772694781987, + "grad_norm": 0.674436628818512, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.04, + "step": 12060 + }, + { + "epoch": 0.3451036454610436, + "grad_norm": 0.8140727281570435, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0417, + "step": 12070 + }, + { + "epoch": 0.34538956397426734, + "grad_norm": 0.6394575238227844, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0413, + "step": 12080 + }, + { + "epoch": 0.3456754824874911, + "grad_norm": 0.5134334564208984, + "learning_rate": 1.425047976058418e-05, + "loss": 0.04, + "step": 12090 + }, + { + "epoch": 0.3459614010007148, + "grad_norm": 0.6670883297920227, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0397, + "step": 12100 + }, + { + "epoch": 0.3462473195139385, + "grad_norm": 0.49804338812828064, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0431, + "step": 12110 + }, + { + "epoch": 0.34653323802716224, + "grad_norm": 0.33912673592567444, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0492, + "step": 12120 + }, + { + "epoch": 0.346819156540386, + "grad_norm": 0.45478618144989014, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0427, + "step": 12130 + }, + { + "epoch": 0.3471050750536097, + "grad_norm": 0.6690845489501953, + "learning_rate": 1.420497389129506e-05, + "loss": 0.044, + "step": 12140 + }, + { + "epoch": 0.34739099356683345, + "grad_norm": 0.9296556115150452, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.042, + "step": 12150 + }, + { + "epoch": 0.3476769120800572, + "grad_norm": 0.4859760105609894, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0386, + "step": 12160 + }, + { + "epoch": 0.3479628305932809, + "grad_norm": 1.0067541599273682, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0495, + "step": 12170 + }, + { + "epoch": 0.34824874910650466, + "grad_norm": 0.7799471616744995, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0614, + "step": 12180 + }, + { + "epoch": 0.3485346676197284, + "grad_norm": 0.48603832721710205, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0422, + "step": 12190 + }, + { + "epoch": 0.3488205861329521, + "grad_norm": 1.2030225992202759, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0535, + "step": 12200 + }, + { + "epoch": 0.3491065046461758, + "grad_norm": 0.5523782968521118, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0437, + "step": 12210 + }, + { + "epoch": 0.34939242315939956, + "grad_norm": 0.9041968584060669, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0441, + "step": 12220 + }, + { + "epoch": 0.3496783416726233, + "grad_norm": 0.5859020948410034, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0451, + "step": 12230 + }, + { + "epoch": 0.34996426018584703, + "grad_norm": 0.8736525177955627, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0439, + "step": 12240 + }, + { + "epoch": 0.35025017869907077, + "grad_norm": 0.4692678153514862, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0516, + "step": 12250 + }, + { + "epoch": 0.3505360972122945, + "grad_norm": 0.6326560974121094, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0427, + "step": 12260 + }, + { + "epoch": 0.35082201572551824, + "grad_norm": 0.6265914440155029, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0392, + "step": 12270 + }, + { + "epoch": 0.351107934238742, + "grad_norm": 0.8684681057929993, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0416, + "step": 12280 + }, + { + "epoch": 0.35139385275196566, + "grad_norm": 0.6076116561889648, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0405, + "step": 12290 + }, + { + "epoch": 0.3516797712651894, + "grad_norm": 0.36192813515663147, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0417, + "step": 12300 + }, + { + "epoch": 0.35196568977841314, + "grad_norm": 0.5561486482620239, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0397, + "step": 12310 + }, + { + "epoch": 0.3522516082916369, + "grad_norm": 0.5955346822738647, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0332, + "step": 12320 + }, + { + "epoch": 0.3525375268048606, + "grad_norm": 0.4861294627189636, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0423, + "step": 12330 + }, + { + "epoch": 0.35282344531808435, + "grad_norm": 0.920704185962677, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0467, + "step": 12340 + }, + { + "epoch": 0.3531093638313081, + "grad_norm": 0.4749159514904022, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0425, + "step": 12350 + }, + { + "epoch": 0.3533952823445318, + "grad_norm": 0.5075432658195496, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0362, + "step": 12360 + }, + { + "epoch": 0.35368120085775556, + "grad_norm": 0.3057022988796234, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0378, + "step": 12370 + }, + { + "epoch": 0.35396711937097924, + "grad_norm": 0.48122167587280273, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0359, + "step": 12380 + }, + { + "epoch": 0.354253037884203, + "grad_norm": 0.39227673411369324, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0432, + "step": 12390 + }, + { + "epoch": 0.3545389563974267, + "grad_norm": 0.641839861869812, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0422, + "step": 12400 + }, + { + "epoch": 0.35482487491065046, + "grad_norm": 1.0422887802124023, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0445, + "step": 12410 + }, + { + "epoch": 0.3551107934238742, + "grad_norm": 0.5336428880691528, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0408, + "step": 12420 + }, + { + "epoch": 0.35539671193709793, + "grad_norm": 0.6634368896484375, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0374, + "step": 12430 + }, + { + "epoch": 0.35568263045032167, + "grad_norm": 0.5840758085250854, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0417, + "step": 12440 + }, + { + "epoch": 0.3559685489635454, + "grad_norm": 0.8465530872344971, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0449, + "step": 12450 + }, + { + "epoch": 0.35625446747676914, + "grad_norm": 0.48737838864326477, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0439, + "step": 12460 + }, + { + "epoch": 0.3565403859899928, + "grad_norm": 1.2267687320709229, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0395, + "step": 12470 + }, + { + "epoch": 0.35682630450321656, + "grad_norm": 0.4097842276096344, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0379, + "step": 12480 + }, + { + "epoch": 0.3571122230164403, + "grad_norm": 0.8895343542098999, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0415, + "step": 12490 + }, + { + "epoch": 0.35739814152966404, + "grad_norm": 0.6732933521270752, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0432, + "step": 12500 + }, + { + "epoch": 0.3576840600428878, + "grad_norm": 0.4521937966346741, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0442, + "step": 12510 + }, + { + "epoch": 0.3579699785561115, + "grad_norm": 0.5932701826095581, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0407, + "step": 12520 + }, + { + "epoch": 0.35825589706933525, + "grad_norm": 0.5595138669013977, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0387, + "step": 12530 + }, + { + "epoch": 0.358541815582559, + "grad_norm": 0.7205538153648376, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0393, + "step": 12540 + }, + { + "epoch": 0.3588277340957827, + "grad_norm": 0.4069580137729645, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0554, + "step": 12550 + }, + { + "epoch": 0.3591136526090064, + "grad_norm": 0.4881740212440491, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0411, + "step": 12560 + }, + { + "epoch": 0.35939957112223014, + "grad_norm": 0.7710328102111816, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.043, + "step": 12570 + }, + { + "epoch": 0.3596854896354539, + "grad_norm": 0.6593908071517944, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.046, + "step": 12580 + }, + { + "epoch": 0.3599714081486776, + "grad_norm": 0.6712149977684021, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0392, + "step": 12590 + }, + { + "epoch": 0.36025732666190136, + "grad_norm": 0.6103658080101013, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0482, + "step": 12600 + }, + { + "epoch": 0.3605432451751251, + "grad_norm": 0.5170528292655945, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0441, + "step": 12610 + }, + { + "epoch": 0.36082916368834883, + "grad_norm": 0.47434374690055847, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0436, + "step": 12620 + }, + { + "epoch": 0.36111508220157257, + "grad_norm": 0.6546452045440674, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0441, + "step": 12630 + }, + { + "epoch": 0.3614010007147963, + "grad_norm": 1.3334686756134033, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0464, + "step": 12640 + }, + { + "epoch": 0.36168691922802, + "grad_norm": 1.3882309198379517, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0527, + "step": 12650 + }, + { + "epoch": 0.3619728377412437, + "grad_norm": 0.829872190952301, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0499, + "step": 12660 + }, + { + "epoch": 0.36225875625446746, + "grad_norm": 0.6917227506637573, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0513, + "step": 12670 + }, + { + "epoch": 0.3625446747676912, + "grad_norm": 0.3825722634792328, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0455, + "step": 12680 + }, + { + "epoch": 0.36283059328091494, + "grad_norm": 0.7726976275444031, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0366, + "step": 12690 + }, + { + "epoch": 0.3631165117941387, + "grad_norm": 0.48851099610328674, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0363, + "step": 12700 + }, + { + "epoch": 0.3634024303073624, + "grad_norm": 0.5034362077713013, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0461, + "step": 12710 + }, + { + "epoch": 0.36368834882058615, + "grad_norm": 0.8411096334457397, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0448, + "step": 12720 + }, + { + "epoch": 0.3639742673338099, + "grad_norm": 0.7185337543487549, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0366, + "step": 12730 + }, + { + "epoch": 0.36426018584703357, + "grad_norm": 0.5850857496261597, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0414, + "step": 12740 + }, + { + "epoch": 0.3645461043602573, + "grad_norm": 0.47304606437683105, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0464, + "step": 12750 + }, + { + "epoch": 0.36483202287348104, + "grad_norm": 0.7190109491348267, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0418, + "step": 12760 + }, + { + "epoch": 0.3651179413867048, + "grad_norm": 0.8053406476974487, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0407, + "step": 12770 + }, + { + "epoch": 0.3654038598999285, + "grad_norm": 0.8875076174736023, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0471, + "step": 12780 + }, + { + "epoch": 0.36568977841315226, + "grad_norm": 0.5206999182701111, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0478, + "step": 12790 + }, + { + "epoch": 0.365975696926376, + "grad_norm": 0.5034269690513611, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0422, + "step": 12800 + }, + { + "epoch": 0.36626161543959973, + "grad_norm": 0.9846853017807007, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.045, + "step": 12810 + }, + { + "epoch": 0.36654753395282347, + "grad_norm": 0.49341151118278503, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0471, + "step": 12820 + }, + { + "epoch": 0.36683345246604715, + "grad_norm": 0.765583336353302, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0411, + "step": 12830 + }, + { + "epoch": 0.3671193709792709, + "grad_norm": 0.5193378925323486, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0522, + "step": 12840 + }, + { + "epoch": 0.3674052894924946, + "grad_norm": 0.8142374157905579, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0374, + "step": 12850 + }, + { + "epoch": 0.36769120800571836, + "grad_norm": 0.7233540415763855, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0516, + "step": 12860 + }, + { + "epoch": 0.3679771265189421, + "grad_norm": 0.38758793473243713, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0437, + "step": 12870 + }, + { + "epoch": 0.36826304503216584, + "grad_norm": 0.36923956871032715, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.041, + "step": 12880 + }, + { + "epoch": 0.3685489635453896, + "grad_norm": 1.0518147945404053, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0446, + "step": 12890 + }, + { + "epoch": 0.3688348820586133, + "grad_norm": 0.5833591818809509, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0362, + "step": 12900 + }, + { + "epoch": 0.36912080057183705, + "grad_norm": 0.6178849339485168, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.041, + "step": 12910 + }, + { + "epoch": 0.36940671908506073, + "grad_norm": 0.7599044442176819, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0473, + "step": 12920 + }, + { + "epoch": 0.36969263759828447, + "grad_norm": 0.7787651419639587, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0461, + "step": 12930 + }, + { + "epoch": 0.3699785561115082, + "grad_norm": 0.3847586512565613, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0413, + "step": 12940 + }, + { + "epoch": 0.37026447462473194, + "grad_norm": 0.6218805313110352, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0424, + "step": 12950 + }, + { + "epoch": 0.3705503931379557, + "grad_norm": 0.6770363450050354, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0426, + "step": 12960 + }, + { + "epoch": 0.3708363116511794, + "grad_norm": 0.6817107796669006, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.041, + "step": 12970 + }, + { + "epoch": 0.37112223016440316, + "grad_norm": 1.6997944116592407, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0626, + "step": 12980 + }, + { + "epoch": 0.3714081486776269, + "grad_norm": 0.4540708363056183, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0356, + "step": 12990 + }, + { + "epoch": 0.37169406719085063, + "grad_norm": 0.4272336959838867, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0354, + "step": 13000 + }, + { + "epoch": 0.3719799857040743, + "grad_norm": 0.4723891019821167, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0438, + "step": 13010 + }, + { + "epoch": 0.37226590421729805, + "grad_norm": 0.5508099794387817, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.042, + "step": 13020 + }, + { + "epoch": 0.3725518227305218, + "grad_norm": 1.05836021900177, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0472, + "step": 13030 + }, + { + "epoch": 0.3728377412437455, + "grad_norm": 0.4397801458835602, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0462, + "step": 13040 + }, + { + "epoch": 0.37312365975696926, + "grad_norm": 0.3131158649921417, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0383, + "step": 13050 + }, + { + "epoch": 0.373409578270193, + "grad_norm": 0.5489990711212158, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0398, + "step": 13060 + }, + { + "epoch": 0.37369549678341674, + "grad_norm": 0.7425751686096191, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0416, + "step": 13070 + }, + { + "epoch": 0.3739814152966405, + "grad_norm": 0.6337125301361084, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0387, + "step": 13080 + }, + { + "epoch": 0.3742673338098642, + "grad_norm": 0.656467854976654, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0431, + "step": 13090 + }, + { + "epoch": 0.3745532523230879, + "grad_norm": 0.7011964321136475, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0487, + "step": 13100 + }, + { + "epoch": 0.37483917083631163, + "grad_norm": 0.4949609041213989, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0429, + "step": 13110 + }, + { + "epoch": 0.37512508934953537, + "grad_norm": 0.6796516180038452, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0405, + "step": 13120 + }, + { + "epoch": 0.3754110078627591, + "grad_norm": 0.41161492466926575, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0359, + "step": 13130 + }, + { + "epoch": 0.37569692637598284, + "grad_norm": 0.4463254511356354, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0353, + "step": 13140 + }, + { + "epoch": 0.3759828448892066, + "grad_norm": 0.4082377254962921, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.047, + "step": 13150 + }, + { + "epoch": 0.3762687634024303, + "grad_norm": 0.7927104830741882, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0484, + "step": 13160 + }, + { + "epoch": 0.37655468191565405, + "grad_norm": 0.5212385058403015, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.041, + "step": 13170 + }, + { + "epoch": 0.3768406004288778, + "grad_norm": 0.7408128380775452, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0462, + "step": 13180 + }, + { + "epoch": 0.3771265189421015, + "grad_norm": 0.3847906291484833, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0361, + "step": 13190 + }, + { + "epoch": 0.3774124374553252, + "grad_norm": 0.5039756298065186, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0385, + "step": 13200 + }, + { + "epoch": 0.37769835596854895, + "grad_norm": 0.5682945251464844, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0369, + "step": 13210 + }, + { + "epoch": 0.3779842744817727, + "grad_norm": 0.5985261797904968, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0376, + "step": 13220 + }, + { + "epoch": 0.3782701929949964, + "grad_norm": 0.7080312967300415, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0514, + "step": 13230 + }, + { + "epoch": 0.37855611150822016, + "grad_norm": 0.7488406300544739, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0421, + "step": 13240 + }, + { + "epoch": 0.3788420300214439, + "grad_norm": 0.38066044449806213, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0411, + "step": 13250 + }, + { + "epoch": 0.37912794853466764, + "grad_norm": 0.6335283517837524, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0526, + "step": 13260 + }, + { + "epoch": 0.3794138670478914, + "grad_norm": 0.7008160352706909, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0402, + "step": 13270 + }, + { + "epoch": 0.37969978556111506, + "grad_norm": 0.4219777286052704, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.039, + "step": 13280 + }, + { + "epoch": 0.3799857040743388, + "grad_norm": 0.6447705030441284, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0412, + "step": 13290 + }, + { + "epoch": 0.38027162258756253, + "grad_norm": 0.4625374674797058, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0437, + "step": 13300 + }, + { + "epoch": 0.38055754110078627, + "grad_norm": 0.4056257903575897, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0377, + "step": 13310 + }, + { + "epoch": 0.38084345961401, + "grad_norm": 0.425281286239624, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0378, + "step": 13320 + }, + { + "epoch": 0.38112937812723374, + "grad_norm": 0.4031837582588196, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0361, + "step": 13330 + }, + { + "epoch": 0.3814152966404575, + "grad_norm": 0.469175785779953, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0391, + "step": 13340 + }, + { + "epoch": 0.3817012151536812, + "grad_norm": 0.36555227637290955, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0352, + "step": 13350 + }, + { + "epoch": 0.38198713366690495, + "grad_norm": 0.8802763819694519, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0412, + "step": 13360 + }, + { + "epoch": 0.38227305218012864, + "grad_norm": 0.5733079314231873, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0418, + "step": 13370 + }, + { + "epoch": 0.3825589706933524, + "grad_norm": 0.606238842010498, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0518, + "step": 13380 + }, + { + "epoch": 0.3828448892065761, + "grad_norm": 0.5096673369407654, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0404, + "step": 13390 + }, + { + "epoch": 0.38313080771979985, + "grad_norm": 0.8240867853164673, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0513, + "step": 13400 + }, + { + "epoch": 0.3834167262330236, + "grad_norm": 0.3757685422897339, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0407, + "step": 13410 + }, + { + "epoch": 0.3837026447462473, + "grad_norm": 0.4560941755771637, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0429, + "step": 13420 + }, + { + "epoch": 0.38398856325947106, + "grad_norm": 0.42831951379776, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0387, + "step": 13430 + }, + { + "epoch": 0.3842744817726948, + "grad_norm": 0.8373785614967346, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0473, + "step": 13440 + }, + { + "epoch": 0.38456040028591854, + "grad_norm": 0.9560670256614685, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0442, + "step": 13450 + }, + { + "epoch": 0.3848463187991422, + "grad_norm": 0.4101570248603821, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0429, + "step": 13460 + }, + { + "epoch": 0.38513223731236595, + "grad_norm": 0.673739492893219, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0525, + "step": 13470 + }, + { + "epoch": 0.3854181558255897, + "grad_norm": 1.126909852027893, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0499, + "step": 13480 + }, + { + "epoch": 0.38570407433881343, + "grad_norm": 0.571437656879425, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0431, + "step": 13490 + }, + { + "epoch": 0.38598999285203717, + "grad_norm": 0.5121229887008667, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0419, + "step": 13500 + }, + { + "epoch": 0.3862759113652609, + "grad_norm": 0.6143786907196045, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0373, + "step": 13510 + }, + { + "epoch": 0.38656182987848464, + "grad_norm": 0.395014226436615, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0457, + "step": 13520 + }, + { + "epoch": 0.3868477483917084, + "grad_norm": 0.46027693152427673, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0372, + "step": 13530 + }, + { + "epoch": 0.3871336669049321, + "grad_norm": 0.42744559049606323, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0417, + "step": 13540 + }, + { + "epoch": 0.3874195854181558, + "grad_norm": 0.4765837490558624, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0442, + "step": 13550 + }, + { + "epoch": 0.38770550393137954, + "grad_norm": 0.9767054319381714, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0397, + "step": 13560 + }, + { + "epoch": 0.3879914224446033, + "grad_norm": 0.5535935759544373, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0423, + "step": 13570 + }, + { + "epoch": 0.388277340957827, + "grad_norm": 0.3802829384803772, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0329, + "step": 13580 + }, + { + "epoch": 0.38856325947105075, + "grad_norm": 0.6564178466796875, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0423, + "step": 13590 + }, + { + "epoch": 0.3888491779842745, + "grad_norm": 0.4400223195552826, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0356, + "step": 13600 + }, + { + "epoch": 0.3891350964974982, + "grad_norm": 0.4441612958908081, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0576, + "step": 13610 + }, + { + "epoch": 0.38942101501072196, + "grad_norm": 0.5270922780036926, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0406, + "step": 13620 + }, + { + "epoch": 0.3897069335239457, + "grad_norm": 0.6497722268104553, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0389, + "step": 13630 + }, + { + "epoch": 0.3899928520371694, + "grad_norm": 0.628182053565979, + "learning_rate": 1.280216624157504e-05, + "loss": 0.049, + "step": 13640 + }, + { + "epoch": 0.3902787705503931, + "grad_norm": 0.5242640376091003, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0389, + "step": 13650 + }, + { + "epoch": 0.39056468906361685, + "grad_norm": 0.5140895843505859, + "learning_rate": 1.278305741539386e-05, + "loss": 0.047, + "step": 13660 + }, + { + "epoch": 0.3908506075768406, + "grad_norm": 0.531012773513794, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0415, + "step": 13670 + }, + { + "epoch": 0.39113652609006433, + "grad_norm": 0.5066007375717163, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0411, + "step": 13680 + }, + { + "epoch": 0.39142244460328807, + "grad_norm": 1.0783177614212036, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0371, + "step": 13690 + }, + { + "epoch": 0.3917083631165118, + "grad_norm": 0.592755913734436, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0402, + "step": 13700 + }, + { + "epoch": 0.39199428162973554, + "grad_norm": 0.5595790147781372, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0543, + "step": 13710 + }, + { + "epoch": 0.3922802001429593, + "grad_norm": 0.5388237237930298, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0487, + "step": 13720 + }, + { + "epoch": 0.39256611865618296, + "grad_norm": 0.5311065316200256, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0389, + "step": 13730 + }, + { + "epoch": 0.3928520371694067, + "grad_norm": 0.8037494421005249, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0456, + "step": 13740 + }, + { + "epoch": 0.39313795568263044, + "grad_norm": 0.851921796798706, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0389, + "step": 13750 + }, + { + "epoch": 0.3934238741958542, + "grad_norm": 0.5924596190452576, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0401, + "step": 13760 + }, + { + "epoch": 0.3937097927090779, + "grad_norm": 0.5660725831985474, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0443, + "step": 13770 + }, + { + "epoch": 0.39399571122230165, + "grad_norm": 0.4110502004623413, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0438, + "step": 13780 + }, + { + "epoch": 0.3942816297355254, + "grad_norm": 0.7104408144950867, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.042, + "step": 13790 + }, + { + "epoch": 0.3945675482487491, + "grad_norm": 0.5490137338638306, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0477, + "step": 13800 + }, + { + "epoch": 0.39485346676197286, + "grad_norm": 0.4189203083515167, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0446, + "step": 13810 + }, + { + "epoch": 0.39513938527519654, + "grad_norm": 3.620929479598999, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0541, + "step": 13820 + }, + { + "epoch": 0.3954253037884203, + "grad_norm": 0.4670915901660919, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0391, + "step": 13830 + }, + { + "epoch": 0.395711222301644, + "grad_norm": 0.4475649297237396, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.04, + "step": 13840 + }, + { + "epoch": 0.39599714081486775, + "grad_norm": 0.4646693170070648, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0412, + "step": 13850 + }, + { + "epoch": 0.3962830593280915, + "grad_norm": 0.4141371250152588, + "learning_rate": 1.259152361972498e-05, + "loss": 0.039, + "step": 13860 + }, + { + "epoch": 0.39656897784131523, + "grad_norm": 0.7549411058425903, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0414, + "step": 13870 + }, + { + "epoch": 0.39685489635453897, + "grad_norm": 0.5687856078147888, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0441, + "step": 13880 + }, + { + "epoch": 0.3971408148677627, + "grad_norm": 0.582946240901947, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0451, + "step": 13890 + }, + { + "epoch": 0.39742673338098644, + "grad_norm": 0.6410595178604126, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0362, + "step": 13900 + }, + { + "epoch": 0.3977126518942101, + "grad_norm": 0.4375670850276947, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0552, + "step": 13910 + }, + { + "epoch": 0.39799857040743386, + "grad_norm": 0.5675646662712097, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0373, + "step": 13920 + }, + { + "epoch": 0.3982844889206576, + "grad_norm": 0.544170618057251, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0449, + "step": 13930 + }, + { + "epoch": 0.39857040743388134, + "grad_norm": 0.44928276538848877, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0461, + "step": 13940 + }, + { + "epoch": 0.3988563259471051, + "grad_norm": 0.511382520198822, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0413, + "step": 13950 + }, + { + "epoch": 0.3991422444603288, + "grad_norm": 0.38443753123283386, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0374, + "step": 13960 + }, + { + "epoch": 0.39942816297355255, + "grad_norm": 0.5726080536842346, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0553, + "step": 13970 + }, + { + "epoch": 0.3997140814867763, + "grad_norm": 0.554694414138794, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0404, + "step": 13980 + }, + { + "epoch": 0.4, + "grad_norm": 0.4891316592693329, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0418, + "step": 13990 + }, + { + "epoch": 0.4002859185132237, + "grad_norm": 0.5150312781333923, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0418, + "step": 14000 + }, + { + "epoch": 0.40057183702644744, + "grad_norm": 0.9077253937721252, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0415, + "step": 14010 + }, + { + "epoch": 0.4008577555396712, + "grad_norm": 0.9126781225204468, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.048, + "step": 14020 + }, + { + "epoch": 0.4011436740528949, + "grad_norm": 0.6264623999595642, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0411, + "step": 14030 + }, + { + "epoch": 0.40142959256611865, + "grad_norm": 0.523853600025177, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.051, + "step": 14040 + }, + { + "epoch": 0.4017155110793424, + "grad_norm": 0.6340035200119019, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0426, + "step": 14050 + }, + { + "epoch": 0.40200142959256613, + "grad_norm": 0.3594725430011749, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0397, + "step": 14060 + }, + { + "epoch": 0.40228734810578987, + "grad_norm": 0.941470742225647, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0402, + "step": 14070 + }, + { + "epoch": 0.4025732666190136, + "grad_norm": 0.840506911277771, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0473, + "step": 14080 + }, + { + "epoch": 0.4028591851322373, + "grad_norm": 0.3359200954437256, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0405, + "step": 14090 + }, + { + "epoch": 0.403145103645461, + "grad_norm": 0.49658629298210144, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0464, + "step": 14100 + }, + { + "epoch": 0.40343102215868476, + "grad_norm": 0.7940187454223633, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0417, + "step": 14110 + }, + { + "epoch": 0.4037169406719085, + "grad_norm": 0.30110660195350647, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0371, + "step": 14120 + }, + { + "epoch": 0.40400285918513223, + "grad_norm": 0.42845240235328674, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.053, + "step": 14130 + }, + { + "epoch": 0.40428877769835597, + "grad_norm": 0.997348427772522, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.041, + "step": 14140 + }, + { + "epoch": 0.4045746962115797, + "grad_norm": 0.4759966731071472, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0377, + "step": 14150 + }, + { + "epoch": 0.40486061472480345, + "grad_norm": 0.42045602202415466, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0397, + "step": 14160 + }, + { + "epoch": 0.4051465332380272, + "grad_norm": 0.6400002837181091, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0507, + "step": 14170 + }, + { + "epoch": 0.40543245175125087, + "grad_norm": 0.5473673939704895, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0359, + "step": 14180 + }, + { + "epoch": 0.4057183702644746, + "grad_norm": 0.7414730787277222, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0416, + "step": 14190 + }, + { + "epoch": 0.40600428877769834, + "grad_norm": 0.4691861867904663, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0363, + "step": 14200 + }, + { + "epoch": 0.4062902072909221, + "grad_norm": 0.9186112880706787, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0445, + "step": 14210 + }, + { + "epoch": 0.4065761258041458, + "grad_norm": 0.6782190203666687, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.40686204431736955, + "grad_norm": 0.6948013305664062, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.037, + "step": 14230 + }, + { + "epoch": 0.4071479628305933, + "grad_norm": 0.3034680485725403, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0371, + "step": 14240 + }, + { + "epoch": 0.40743388134381703, + "grad_norm": 0.4254174828529358, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0449, + "step": 14250 + }, + { + "epoch": 0.40771979985704077, + "grad_norm": 1.3622064590454102, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0428, + "step": 14260 + }, + { + "epoch": 0.40800571837026445, + "grad_norm": 0.5928359031677246, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0424, + "step": 14270 + }, + { + "epoch": 0.4082916368834882, + "grad_norm": 0.9103132486343384, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0414, + "step": 14280 + }, + { + "epoch": 0.4085775553967119, + "grad_norm": 0.6338028311729431, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0376, + "step": 14290 + }, + { + "epoch": 0.40886347390993566, + "grad_norm": 0.9920284748077393, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0393, + "step": 14300 + }, + { + "epoch": 0.4091493924231594, + "grad_norm": 0.411830335855484, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0336, + "step": 14310 + }, + { + "epoch": 0.40943531093638313, + "grad_norm": 0.6977682709693909, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0454, + "step": 14320 + }, + { + "epoch": 0.40972122944960687, + "grad_norm": 0.6303663849830627, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0453, + "step": 14330 + }, + { + "epoch": 0.4100071479628306, + "grad_norm": 0.3048207759857178, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0373, + "step": 14340 + }, + { + "epoch": 0.41029306647605435, + "grad_norm": 0.7683395743370056, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0438, + "step": 14350 + }, + { + "epoch": 0.41057898498927803, + "grad_norm": 0.5791511535644531, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0392, + "step": 14360 + }, + { + "epoch": 0.41086490350250177, + "grad_norm": 0.876626193523407, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0324, + "step": 14370 + }, + { + "epoch": 0.4111508220157255, + "grad_norm": 0.5971815586090088, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0368, + "step": 14380 + }, + { + "epoch": 0.41143674052894924, + "grad_norm": 0.6508862376213074, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0411, + "step": 14390 + }, + { + "epoch": 0.411722659042173, + "grad_norm": 0.4704359471797943, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0351, + "step": 14400 + }, + { + "epoch": 0.4120085775553967, + "grad_norm": 0.4266453683376312, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0367, + "step": 14410 + }, + { + "epoch": 0.41229449606862045, + "grad_norm": 0.5898434519767761, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0376, + "step": 14420 + }, + { + "epoch": 0.4125804145818442, + "grad_norm": 0.8741532564163208, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0419, + "step": 14430 + }, + { + "epoch": 0.41286633309506793, + "grad_norm": 0.24328190088272095, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0333, + "step": 14440 + }, + { + "epoch": 0.4131522516082916, + "grad_norm": 0.4263601303100586, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.039, + "step": 14450 + }, + { + "epoch": 0.41343817012151535, + "grad_norm": 0.6311615109443665, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0454, + "step": 14460 + }, + { + "epoch": 0.4137240886347391, + "grad_norm": 0.7424519658088684, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0392, + "step": 14470 + }, + { + "epoch": 0.4140100071479628, + "grad_norm": 0.48323145508766174, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0374, + "step": 14480 + }, + { + "epoch": 0.41429592566118656, + "grad_norm": 0.38597407937049866, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0393, + "step": 14490 + }, + { + "epoch": 0.4145818441744103, + "grad_norm": 0.7251518964767456, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0431, + "step": 14500 + }, + { + "epoch": 0.41486776268763403, + "grad_norm": 0.44361060857772827, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0426, + "step": 14510 + }, + { + "epoch": 0.41515368120085777, + "grad_norm": 0.5625014305114746, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0372, + "step": 14520 + }, + { + "epoch": 0.4154395997140815, + "grad_norm": 0.27855798602104187, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0356, + "step": 14530 + }, + { + "epoch": 0.4157255182273052, + "grad_norm": 0.5966296195983887, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0387, + "step": 14540 + }, + { + "epoch": 0.41601143674052893, + "grad_norm": 0.49445512890815735, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0355, + "step": 14550 + }, + { + "epoch": 0.41629735525375267, + "grad_norm": 0.3813278377056122, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0456, + "step": 14560 + }, + { + "epoch": 0.4165832737669764, + "grad_norm": 0.5962988138198853, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0401, + "step": 14570 + }, + { + "epoch": 0.41686919228020014, + "grad_norm": 0.4028547406196594, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0371, + "step": 14580 + }, + { + "epoch": 0.4171551107934239, + "grad_norm": 1.348706841468811, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0426, + "step": 14590 + }, + { + "epoch": 0.4174410293066476, + "grad_norm": 1.2782070636749268, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0393, + "step": 14600 + }, + { + "epoch": 0.41772694781987135, + "grad_norm": 1.0024999380111694, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0436, + "step": 14610 + }, + { + "epoch": 0.4180128663330951, + "grad_norm": 0.35450127720832825, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0411, + "step": 14620 + }, + { + "epoch": 0.41829878484631877, + "grad_norm": 0.5827250480651855, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0372, + "step": 14630 + }, + { + "epoch": 0.4185847033595425, + "grad_norm": 0.5905774235725403, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0394, + "step": 14640 + }, + { + "epoch": 0.41887062187276625, + "grad_norm": 0.652074933052063, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0405, + "step": 14650 + }, + { + "epoch": 0.41915654038599, + "grad_norm": 0.7245490550994873, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0473, + "step": 14660 + }, + { + "epoch": 0.4194424588992137, + "grad_norm": 0.5153012871742249, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.043, + "step": 14670 + }, + { + "epoch": 0.41972837741243746, + "grad_norm": 0.516107976436615, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0434, + "step": 14680 + }, + { + "epoch": 0.4200142959256612, + "grad_norm": 0.4743354618549347, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0429, + "step": 14690 + }, + { + "epoch": 0.42030021443888493, + "grad_norm": 0.547875165939331, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0395, + "step": 14700 + }, + { + "epoch": 0.42058613295210867, + "grad_norm": 0.6398400068283081, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0384, + "step": 14710 + }, + { + "epoch": 0.42087205146533235, + "grad_norm": 0.5891467332839966, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0399, + "step": 14720 + }, + { + "epoch": 0.4211579699785561, + "grad_norm": 0.3927595615386963, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0353, + "step": 14730 + }, + { + "epoch": 0.42144388849177983, + "grad_norm": 0.6477030515670776, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0492, + "step": 14740 + }, + { + "epoch": 0.42172980700500357, + "grad_norm": 0.7090615034103394, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.042, + "step": 14750 + }, + { + "epoch": 0.4220157255182273, + "grad_norm": 0.6572134494781494, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0406, + "step": 14760 + }, + { + "epoch": 0.42230164403145104, + "grad_norm": 0.787663996219635, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0424, + "step": 14770 + }, + { + "epoch": 0.4225875625446748, + "grad_norm": 0.8419309258460999, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0427, + "step": 14780 + }, + { + "epoch": 0.4228734810578985, + "grad_norm": 0.6204128861427307, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0364, + "step": 14790 + }, + { + "epoch": 0.42315939957112225, + "grad_norm": 0.7446070313453674, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0391, + "step": 14800 + }, + { + "epoch": 0.42344531808434593, + "grad_norm": 0.7446451783180237, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0384, + "step": 14810 + }, + { + "epoch": 0.42373123659756967, + "grad_norm": 0.6946475505828857, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0375, + "step": 14820 + }, + { + "epoch": 0.4240171551107934, + "grad_norm": 0.6997008323669434, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0393, + "step": 14830 + }, + { + "epoch": 0.42430307362401715, + "grad_norm": 0.4857316315174103, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0474, + "step": 14840 + }, + { + "epoch": 0.4245889921372409, + "grad_norm": 1.3516888618469238, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.047, + "step": 14850 + }, + { + "epoch": 0.4248749106504646, + "grad_norm": 0.40320220589637756, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0418, + "step": 14860 + }, + { + "epoch": 0.42516082916368836, + "grad_norm": 0.9002796411514282, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0434, + "step": 14870 + }, + { + "epoch": 0.4254467476769121, + "grad_norm": 0.3810071349143982, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0338, + "step": 14880 + }, + { + "epoch": 0.42573266619013583, + "grad_norm": 0.5786157250404358, + "learning_rate": 1.159527607963768e-05, + "loss": 0.037, + "step": 14890 + }, + { + "epoch": 0.4260185847033595, + "grad_norm": 0.6316869258880615, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0388, + "step": 14900 + }, + { + "epoch": 0.42630450321658325, + "grad_norm": 0.608745276927948, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0426, + "step": 14910 + }, + { + "epoch": 0.426590421729807, + "grad_norm": 0.6655036807060242, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0433, + "step": 14920 + }, + { + "epoch": 0.4268763402430307, + "grad_norm": 0.29059523344039917, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0507, + "step": 14930 + }, + { + "epoch": 0.42716225875625446, + "grad_norm": 0.9066076278686523, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0447, + "step": 14940 + }, + { + "epoch": 0.4274481772694782, + "grad_norm": 1.0660220384597778, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0512, + "step": 14950 + }, + { + "epoch": 0.42773409578270194, + "grad_norm": 0.6081144213676453, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0426, + "step": 14960 + }, + { + "epoch": 0.4280200142959257, + "grad_norm": 0.46524369716644287, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0435, + "step": 14970 + }, + { + "epoch": 0.4283059328091494, + "grad_norm": 0.3497388958930969, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0492, + "step": 14980 + }, + { + "epoch": 0.4285918513223731, + "grad_norm": 0.41300803422927856, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.034, + "step": 14990 + }, + { + "epoch": 0.42887776983559683, + "grad_norm": 0.4363289177417755, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0358, + "step": 15000 + }, + { + "epoch": 0.42916368834882057, + "grad_norm": 1.314915418624878, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.047, + "step": 15010 + }, + { + "epoch": 0.4294496068620443, + "grad_norm": 0.558199942111969, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0313, + "step": 15020 + }, + { + "epoch": 0.42973552537526805, + "grad_norm": 0.3857463598251343, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0416, + "step": 15030 + }, + { + "epoch": 0.4300214438884918, + "grad_norm": 0.4701749384403229, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0425, + "step": 15040 + }, + { + "epoch": 0.4303073624017155, + "grad_norm": 0.4611213803291321, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0457, + "step": 15050 + }, + { + "epoch": 0.43059328091493926, + "grad_norm": 0.5338016152381897, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.038, + "step": 15060 + }, + { + "epoch": 0.430879199428163, + "grad_norm": 0.9078943133354187, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0395, + "step": 15070 + }, + { + "epoch": 0.4311651179413867, + "grad_norm": 0.5354048013687134, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0403, + "step": 15080 + }, + { + "epoch": 0.4314510364546104, + "grad_norm": 0.35511279106140137, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0377, + "step": 15090 + }, + { + "epoch": 0.43173695496783415, + "grad_norm": 0.37104350328445435, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0426, + "step": 15100 + }, + { + "epoch": 0.4320228734810579, + "grad_norm": 0.8916210532188416, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0387, + "step": 15110 + }, + { + "epoch": 0.4323087919942816, + "grad_norm": 0.514994740486145, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0384, + "step": 15120 + }, + { + "epoch": 0.43259471050750536, + "grad_norm": 0.8440690040588379, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0437, + "step": 15130 + }, + { + "epoch": 0.4328806290207291, + "grad_norm": 0.6815949082374573, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0453, + "step": 15140 + }, + { + "epoch": 0.43316654753395284, + "grad_norm": 0.33178189396858215, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0351, + "step": 15150 + }, + { + "epoch": 0.4334524660471766, + "grad_norm": 0.5686727166175842, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0368, + "step": 15160 + }, + { + "epoch": 0.43373838456040026, + "grad_norm": 0.44143930077552795, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0443, + "step": 15170 + }, + { + "epoch": 0.434024303073624, + "grad_norm": 0.3238232135772705, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0348, + "step": 15180 + }, + { + "epoch": 0.43431022158684773, + "grad_norm": 0.5038242340087891, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0343, + "step": 15190 + }, + { + "epoch": 0.43459614010007147, + "grad_norm": 0.4904351234436035, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0397, + "step": 15200 + }, + { + "epoch": 0.4348820586132952, + "grad_norm": 0.5325750708580017, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0499, + "step": 15210 + }, + { + "epoch": 0.43516797712651895, + "grad_norm": 0.39443954825401306, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.044, + "step": 15220 + }, + { + "epoch": 0.4354538956397427, + "grad_norm": 0.6782003045082092, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0358, + "step": 15230 + }, + { + "epoch": 0.4357398141529664, + "grad_norm": 0.47862571477890015, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0418, + "step": 15240 + }, + { + "epoch": 0.43602573266619016, + "grad_norm": 1.6515535116195679, + "learning_rate": 1.124468908014616e-05, + "loss": 0.043, + "step": 15250 + }, + { + "epoch": 0.43631165117941384, + "grad_norm": 0.4902660846710205, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0371, + "step": 15260 + }, + { + "epoch": 0.4365975696926376, + "grad_norm": 0.5742762088775635, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0369, + "step": 15270 + }, + { + "epoch": 0.4368834882058613, + "grad_norm": 0.42058590054512024, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0378, + "step": 15280 + }, + { + "epoch": 0.43716940671908505, + "grad_norm": 0.43729284405708313, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0352, + "step": 15290 + }, + { + "epoch": 0.4374553252323088, + "grad_norm": 0.4689466953277588, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0433, + "step": 15300 + }, + { + "epoch": 0.4377412437455325, + "grad_norm": 0.6272432208061218, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0548, + "step": 15310 + }, + { + "epoch": 0.43802716225875626, + "grad_norm": 1.1129611730575562, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0437, + "step": 15320 + }, + { + "epoch": 0.43831308077198, + "grad_norm": 0.9332655072212219, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0503, + "step": 15330 + }, + { + "epoch": 0.43859899928520374, + "grad_norm": 0.35150477290153503, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0351, + "step": 15340 + }, + { + "epoch": 0.4388849177984274, + "grad_norm": 0.3826565444469452, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0361, + "step": 15350 + }, + { + "epoch": 0.43917083631165116, + "grad_norm": 0.817319393157959, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0352, + "step": 15360 + }, + { + "epoch": 0.4394567548248749, + "grad_norm": 0.4379598796367645, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0469, + "step": 15370 + }, + { + "epoch": 0.43974267333809863, + "grad_norm": 0.6475314497947693, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0456, + "step": 15380 + }, + { + "epoch": 0.44002859185132237, + "grad_norm": 0.529088020324707, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0453, + "step": 15390 + }, + { + "epoch": 0.4403145103645461, + "grad_norm": 0.4915194809436798, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0369, + "step": 15400 + }, + { + "epoch": 0.44060042887776985, + "grad_norm": 0.4766380786895752, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0391, + "step": 15410 + }, + { + "epoch": 0.4408863473909936, + "grad_norm": 0.34667786955833435, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0327, + "step": 15420 + }, + { + "epoch": 0.4411722659042173, + "grad_norm": 0.504242479801178, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0413, + "step": 15430 + }, + { + "epoch": 0.441458184417441, + "grad_norm": 0.49786439538002014, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0361, + "step": 15440 + }, + { + "epoch": 0.44174410293066474, + "grad_norm": 0.4997329115867615, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0368, + "step": 15450 + }, + { + "epoch": 0.4420300214438885, + "grad_norm": 0.2992185056209564, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0359, + "step": 15460 + }, + { + "epoch": 0.4423159399571122, + "grad_norm": 0.6645393371582031, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0401, + "step": 15470 + }, + { + "epoch": 0.44260185847033595, + "grad_norm": 0.6327983140945435, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0386, + "step": 15480 + }, + { + "epoch": 0.4428877769835597, + "grad_norm": 0.45607903599739075, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0386, + "step": 15490 + }, + { + "epoch": 0.4431736954967834, + "grad_norm": 0.4401610493659973, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0417, + "step": 15500 + }, + { + "epoch": 0.44345961401000716, + "grad_norm": 0.5778466463088989, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0417, + "step": 15510 + }, + { + "epoch": 0.4437455325232309, + "grad_norm": 0.2164914309978485, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0355, + "step": 15520 + }, + { + "epoch": 0.4440314510364546, + "grad_norm": 0.3869318664073944, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0361, + "step": 15530 + }, + { + "epoch": 0.4443173695496783, + "grad_norm": 0.3843154311180115, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0459, + "step": 15540 + }, + { + "epoch": 0.44460328806290206, + "grad_norm": 0.8488825559616089, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0406, + "step": 15550 + }, + { + "epoch": 0.4448892065761258, + "grad_norm": 0.5055183172225952, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0359, + "step": 15560 + }, + { + "epoch": 0.44517512508934953, + "grad_norm": 0.40923011302948, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0435, + "step": 15570 + }, + { + "epoch": 0.44546104360257327, + "grad_norm": 0.48997730016708374, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0395, + "step": 15580 + }, + { + "epoch": 0.445746962115797, + "grad_norm": 0.5149131417274475, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.041, + "step": 15590 + }, + { + "epoch": 0.44603288062902074, + "grad_norm": 0.7277303338050842, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0452, + "step": 15600 + }, + { + "epoch": 0.4463187991422445, + "grad_norm": 0.48676377534866333, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0363, + "step": 15610 + }, + { + "epoch": 0.44660471765546816, + "grad_norm": 0.49031221866607666, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0356, + "step": 15620 + }, + { + "epoch": 0.4468906361686919, + "grad_norm": 0.38877514004707336, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.036, + "step": 15630 + }, + { + "epoch": 0.44717655468191564, + "grad_norm": 0.570068895816803, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0403, + "step": 15640 + }, + { + "epoch": 0.4474624731951394, + "grad_norm": 0.48499882221221924, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0395, + "step": 15650 + }, + { + "epoch": 0.4477483917083631, + "grad_norm": 0.7251732349395752, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0399, + "step": 15660 + }, + { + "epoch": 0.44803431022158685, + "grad_norm": 0.3927334249019623, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0359, + "step": 15670 + }, + { + "epoch": 0.4483202287348106, + "grad_norm": 0.5614549517631531, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.035, + "step": 15680 + }, + { + "epoch": 0.4486061472480343, + "grad_norm": 0.383831262588501, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0416, + "step": 15690 + }, + { + "epoch": 0.44889206576125806, + "grad_norm": 1.9365276098251343, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0498, + "step": 15700 + }, + { + "epoch": 0.44917798427448175, + "grad_norm": 0.6964924931526184, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.034, + "step": 15710 + }, + { + "epoch": 0.4494639027877055, + "grad_norm": 0.5148108601570129, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0401, + "step": 15720 + }, + { + "epoch": 0.4497498213009292, + "grad_norm": 0.4529317617416382, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0361, + "step": 15730 + }, + { + "epoch": 0.45003573981415296, + "grad_norm": 0.6648512482643127, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0365, + "step": 15740 + }, + { + "epoch": 0.4503216583273767, + "grad_norm": 0.8183113932609558, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0416, + "step": 15750 + }, + { + "epoch": 0.45060757684060043, + "grad_norm": 0.8802638649940491, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0406, + "step": 15760 + }, + { + "epoch": 0.45089349535382417, + "grad_norm": 0.6329004764556885, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0395, + "step": 15770 + }, + { + "epoch": 0.4511794138670479, + "grad_norm": 0.35283520817756653, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0364, + "step": 15780 + }, + { + "epoch": 0.45146533238027164, + "grad_norm": 0.5156061053276062, + "learning_rate": 1.071827766589186e-05, + "loss": 0.031, + "step": 15790 + }, + { + "epoch": 0.4517512508934953, + "grad_norm": 0.37875205278396606, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0375, + "step": 15800 + }, + { + "epoch": 0.45203716940671906, + "grad_norm": 0.5543273687362671, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0421, + "step": 15810 + }, + { + "epoch": 0.4523230879199428, + "grad_norm": 0.3808431923389435, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0323, + "step": 15820 + }, + { + "epoch": 0.45260900643316654, + "grad_norm": 0.8648643493652344, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0396, + "step": 15830 + }, + { + "epoch": 0.4528949249463903, + "grad_norm": 0.7893536686897278, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0417, + "step": 15840 + }, + { + "epoch": 0.453180843459614, + "grad_norm": 0.904137134552002, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0384, + "step": 15850 + }, + { + "epoch": 0.45346676197283775, + "grad_norm": 0.6095889806747437, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0457, + "step": 15860 + }, + { + "epoch": 0.4537526804860615, + "grad_norm": 0.5691415667533875, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0438, + "step": 15870 + }, + { + "epoch": 0.4540385989992852, + "grad_norm": 0.37868618965148926, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0414, + "step": 15880 + }, + { + "epoch": 0.4543245175125089, + "grad_norm": 0.7962950468063354, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0405, + "step": 15890 + }, + { + "epoch": 0.45461043602573264, + "grad_norm": 0.8862378597259521, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0475, + "step": 15900 + }, + { + "epoch": 0.4548963545389564, + "grad_norm": 0.8762509822845459, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0472, + "step": 15910 + }, + { + "epoch": 0.4551822730521801, + "grad_norm": 0.6006313562393188, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0417, + "step": 15920 + }, + { + "epoch": 0.45546819156540386, + "grad_norm": 0.3340131938457489, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0374, + "step": 15930 + }, + { + "epoch": 0.4557541100786276, + "grad_norm": 0.2639552056789398, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0387, + "step": 15940 + }, + { + "epoch": 0.45604002859185133, + "grad_norm": 0.42564907670021057, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0376, + "step": 15950 + }, + { + "epoch": 0.45632594710507507, + "grad_norm": 0.503834068775177, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0344, + "step": 15960 + }, + { + "epoch": 0.4566118656182988, + "grad_norm": 0.5962334871292114, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0379, + "step": 15970 + }, + { + "epoch": 0.4568977841315225, + "grad_norm": 0.3271556794643402, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0361, + "step": 15980 + }, + { + "epoch": 0.4571837026447462, + "grad_norm": 0.5501612424850464, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0356, + "step": 15990 + }, + { + "epoch": 0.45746962115796996, + "grad_norm": 1.0399914979934692, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.039, + "step": 16000 + }, + { + "epoch": 0.4577555396711937, + "grad_norm": 0.42251288890838623, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0413, + "step": 16010 + }, + { + "epoch": 0.45804145818441744, + "grad_norm": 0.5694882869720459, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0501, + "step": 16020 + }, + { + "epoch": 0.4583273766976412, + "grad_norm": 0.37367814779281616, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0388, + "step": 16030 + }, + { + "epoch": 0.4586132952108649, + "grad_norm": 0.7947224974632263, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0324, + "step": 16040 + }, + { + "epoch": 0.45889921372408865, + "grad_norm": 0.47871798276901245, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0345, + "step": 16050 + }, + { + "epoch": 0.4591851322373124, + "grad_norm": 1.4443609714508057, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0502, + "step": 16060 + }, + { + "epoch": 0.45947105075053607, + "grad_norm": 0.8326191902160645, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0325, + "step": 16070 + }, + { + "epoch": 0.4597569692637598, + "grad_norm": 0.2887400686740875, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.035, + "step": 16080 + }, + { + "epoch": 0.46004288777698354, + "grad_norm": 0.34353405237197876, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0324, + "step": 16090 + }, + { + "epoch": 0.4603288062902073, + "grad_norm": 0.7319850325584412, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0307, + "step": 16100 + }, + { + "epoch": 0.460614724803431, + "grad_norm": 0.6628556847572327, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0398, + "step": 16110 + }, + { + "epoch": 0.46090064331665476, + "grad_norm": 0.39974722266197205, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.038, + "step": 16120 + }, + { + "epoch": 0.4611865618298785, + "grad_norm": 0.7769339680671692, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0425, + "step": 16130 + }, + { + "epoch": 0.46147248034310223, + "grad_norm": 0.6823691129684448, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.039, + "step": 16140 + }, + { + "epoch": 0.46175839885632597, + "grad_norm": 0.6749460697174072, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0388, + "step": 16150 + }, + { + "epoch": 0.46204431736954965, + "grad_norm": 1.0745635032653809, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0406, + "step": 16160 + }, + { + "epoch": 0.4623302358827734, + "grad_norm": 0.8388734459877014, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0345, + "step": 16170 + }, + { + "epoch": 0.4626161543959971, + "grad_norm": 0.675828218460083, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0355, + "step": 16180 + }, + { + "epoch": 0.46290207290922086, + "grad_norm": 0.9872504472732544, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0374, + "step": 16190 + }, + { + "epoch": 0.4631879914224446, + "grad_norm": 0.4705125689506531, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0416, + "step": 16200 + }, + { + "epoch": 0.46347390993566834, + "grad_norm": 0.43577539920806885, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.041, + "step": 16210 + }, + { + "epoch": 0.4637598284488921, + "grad_norm": 0.6472166180610657, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0372, + "step": 16220 + }, + { + "epoch": 0.4640457469621158, + "grad_norm": 1.0108906030654907, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0464, + "step": 16230 + }, + { + "epoch": 0.46433166547533955, + "grad_norm": 0.6221884489059448, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0396, + "step": 16240 + }, + { + "epoch": 0.46461758398856323, + "grad_norm": 0.7375202178955078, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0365, + "step": 16250 + }, + { + "epoch": 0.46490350250178697, + "grad_norm": 0.5090222358703613, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0404, + "step": 16260 + }, + { + "epoch": 0.4651894210150107, + "grad_norm": 0.5641722679138184, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0424, + "step": 16270 + }, + { + "epoch": 0.46547533952823444, + "grad_norm": 0.3946240246295929, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0433, + "step": 16280 + }, + { + "epoch": 0.4657612580414582, + "grad_norm": 0.525059700012207, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0399, + "step": 16290 + }, + { + "epoch": 0.4660471765546819, + "grad_norm": 0.6106441617012024, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0417, + "step": 16300 + }, + { + "epoch": 0.46633309506790566, + "grad_norm": 0.7064299583435059, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0331, + "step": 16310 + }, + { + "epoch": 0.4666190135811294, + "grad_norm": 0.6251654624938965, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0377, + "step": 16320 + }, + { + "epoch": 0.46690493209435313, + "grad_norm": 0.6626482009887695, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0355, + "step": 16330 + }, + { + "epoch": 0.4671908506075768, + "grad_norm": 0.32827794551849365, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0438, + "step": 16340 + }, + { + "epoch": 0.46747676912080055, + "grad_norm": 1.147644281387329, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.041, + "step": 16350 + }, + { + "epoch": 0.4677626876340243, + "grad_norm": 0.5785626769065857, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0362, + "step": 16360 + }, + { + "epoch": 0.468048606147248, + "grad_norm": 0.7087936401367188, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0364, + "step": 16370 + }, + { + "epoch": 0.46833452466047176, + "grad_norm": 0.7729533314704895, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0357, + "step": 16380 + }, + { + "epoch": 0.4686204431736955, + "grad_norm": 0.9080077409744263, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0445, + "step": 16390 + }, + { + "epoch": 0.46890636168691924, + "grad_norm": 0.5273067355155945, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0395, + "step": 16400 + }, + { + "epoch": 0.469192280200143, + "grad_norm": 0.4801991581916809, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0469, + "step": 16410 + }, + { + "epoch": 0.4694781987133667, + "grad_norm": 0.38060688972473145, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0377, + "step": 16420 + }, + { + "epoch": 0.4697641172265904, + "grad_norm": 1.335648536682129, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0444, + "step": 16430 + }, + { + "epoch": 0.47005003573981413, + "grad_norm": 0.6224690079689026, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0365, + "step": 16440 + }, + { + "epoch": 0.47033595425303787, + "grad_norm": 0.39938899874687195, + "learning_rate": 1.007637577910799e-05, + "loss": 0.037, + "step": 16450 + }, + { + "epoch": 0.4706218727662616, + "grad_norm": 0.47899872064590454, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0371, + "step": 16460 + }, + { + "epoch": 0.47090779127948534, + "grad_norm": 0.8991144895553589, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0337, + "step": 16470 + }, + { + "epoch": 0.4711937097927091, + "grad_norm": 0.6228598356246948, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0388, + "step": 16480 + }, + { + "epoch": 0.4714796283059328, + "grad_norm": 0.41108259558677673, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0378, + "step": 16490 + }, + { + "epoch": 0.47176554681915656, + "grad_norm": 0.722955048084259, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0381, + "step": 16500 + }, + { + "epoch": 0.4720514653323803, + "grad_norm": 0.6090973019599915, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0348, + "step": 16510 + }, + { + "epoch": 0.472337383845604, + "grad_norm": 0.483549565076828, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0456, + "step": 16520 + }, + { + "epoch": 0.4726233023588277, + "grad_norm": 0.4134727418422699, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0444, + "step": 16530 + }, + { + "epoch": 0.47290922087205145, + "grad_norm": 0.4629753530025482, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0382, + "step": 16540 + }, + { + "epoch": 0.4731951393852752, + "grad_norm": 0.8709504008293152, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0384, + "step": 16550 + }, + { + "epoch": 0.4734810578984989, + "grad_norm": 0.683397114276886, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0398, + "step": 16560 + }, + { + "epoch": 0.47376697641172266, + "grad_norm": 0.5743465423583984, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0431, + "step": 16570 + }, + { + "epoch": 0.4740528949249464, + "grad_norm": 1.0080480575561523, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0378, + "step": 16580 + }, + { + "epoch": 0.47433881343817014, + "grad_norm": 0.4668700098991394, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0369, + "step": 16590 + }, + { + "epoch": 0.4746247319513939, + "grad_norm": 0.6005896925926208, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0508, + "step": 16600 + }, + { + "epoch": 0.47491065046461756, + "grad_norm": 0.5788530707359314, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0354, + "step": 16610 + }, + { + "epoch": 0.4751965689778413, + "grad_norm": 0.38784441351890564, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0357, + "step": 16620 + }, + { + "epoch": 0.47548248749106503, + "grad_norm": 0.4809567928314209, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0331, + "step": 16630 + }, + { + "epoch": 0.47576840600428877, + "grad_norm": 0.6647809147834778, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0473, + "step": 16640 + }, + { + "epoch": 0.4760543245175125, + "grad_norm": 0.3968522548675537, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0304, + "step": 16650 + }, + { + "epoch": 0.47634024303073624, + "grad_norm": 0.3258526027202606, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0387, + "step": 16660 + }, + { + "epoch": 0.47662616154396, + "grad_norm": 0.43442079424858093, + "learning_rate": 9.863295834019308e-06, + "loss": 0.04, + "step": 16670 + }, + { + "epoch": 0.4769120800571837, + "grad_norm": 0.36909565329551697, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0351, + "step": 16680 + }, + { + "epoch": 0.47719799857040746, + "grad_norm": 0.5566768050193787, + "learning_rate": 9.843955128197274e-06, + "loss": 0.031, + "step": 16690 + }, + { + "epoch": 0.47748391708363114, + "grad_norm": 0.5705142617225647, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0359, + "step": 16700 + }, + { + "epoch": 0.4777698355968549, + "grad_norm": 0.28931716084480286, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0407, + "step": 16710 + }, + { + "epoch": 0.4780557541100786, + "grad_norm": 0.5509498715400696, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0363, + "step": 16720 + }, + { + "epoch": 0.47834167262330235, + "grad_norm": 0.3564346432685852, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0364, + "step": 16730 + }, + { + "epoch": 0.4786275911365261, + "grad_norm": 0.32734423875808716, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0369, + "step": 16740 + }, + { + "epoch": 0.4789135096497498, + "grad_norm": 0.3048594892024994, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0367, + "step": 16750 + }, + { + "epoch": 0.47919942816297356, + "grad_norm": 0.9007049798965454, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0377, + "step": 16760 + }, + { + "epoch": 0.4794853466761973, + "grad_norm": 0.7010983824729919, + "learning_rate": 9.76664747972605e-06, + "loss": 0.039, + "step": 16770 + }, + { + "epoch": 0.47977126518942104, + "grad_norm": 0.644473135471344, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0466, + "step": 16780 + }, + { + "epoch": 0.4800571837026447, + "grad_norm": 0.6333492398262024, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0373, + "step": 16790 + }, + { + "epoch": 0.48034310221586846, + "grad_norm": 0.5148355960845947, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0392, + "step": 16800 + }, + { + "epoch": 0.4806290207290922, + "grad_norm": 0.7288355231285095, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0381, + "step": 16810 + }, + { + "epoch": 0.48091493924231593, + "grad_norm": 0.3674873113632202, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0418, + "step": 16820 + }, + { + "epoch": 0.48120085775553967, + "grad_norm": 0.5055420398712158, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0336, + "step": 16830 + }, + { + "epoch": 0.4814867762687634, + "grad_norm": 0.641754686832428, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0342, + "step": 16840 + }, + { + "epoch": 0.48177269478198714, + "grad_norm": 0.308200478553772, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0364, + "step": 16850 + }, + { + "epoch": 0.4820586132952109, + "grad_norm": 0.41361021995544434, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0342, + "step": 16860 + }, + { + "epoch": 0.4823445318084346, + "grad_norm": 0.45777833461761475, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0353, + "step": 16870 + }, + { + "epoch": 0.4826304503216583, + "grad_norm": 0.7587664723396301, + "learning_rate": 9.660501900166734e-06, + "loss": 0.043, + "step": 16880 + }, + { + "epoch": 0.48291636883488204, + "grad_norm": 0.8740283250808716, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0372, + "step": 16890 + }, + { + "epoch": 0.4832022873481058, + "grad_norm": 0.3009270429611206, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0373, + "step": 16900 + }, + { + "epoch": 0.4834882058613295, + "grad_norm": 0.4439285695552826, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0349, + "step": 16910 + }, + { + "epoch": 0.48377412437455325, + "grad_norm": 0.39849671721458435, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0394, + "step": 16920 + }, + { + "epoch": 0.484060042887777, + "grad_norm": 0.6423043608665466, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0413, + "step": 16930 + }, + { + "epoch": 0.4843459614010007, + "grad_norm": 0.3683928847312927, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0387, + "step": 16940 + }, + { + "epoch": 0.48463187991422446, + "grad_norm": 0.7087769508361816, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0397, + "step": 16950 + }, + { + "epoch": 0.4849177984274482, + "grad_norm": 0.5348120927810669, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0405, + "step": 16960 + }, + { + "epoch": 0.4852037169406719, + "grad_norm": 0.549891471862793, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0363, + "step": 16970 + }, + { + "epoch": 0.4854896354538956, + "grad_norm": 0.7177272439002991, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0343, + "step": 16980 + }, + { + "epoch": 0.48577555396711936, + "grad_norm": 0.595417320728302, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0439, + "step": 16990 + }, + { + "epoch": 0.4860614724803431, + "grad_norm": 0.4838889241218567, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0387, + "step": 17000 + }, + { + "epoch": 0.48634739099356683, + "grad_norm": 0.6186223030090332, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0362, + "step": 17010 + }, + { + "epoch": 0.48663330950679057, + "grad_norm": 0.43383121490478516, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0381, + "step": 17020 + }, + { + "epoch": 0.4869192280200143, + "grad_norm": 0.6735527515411377, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0388, + "step": 17030 + }, + { + "epoch": 0.48720514653323804, + "grad_norm": 0.3746320605278015, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0491, + "step": 17040 + }, + { + "epoch": 0.4874910650464618, + "grad_norm": 0.29500988125801086, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0395, + "step": 17050 + }, + { + "epoch": 0.48777698355968546, + "grad_norm": 0.8518465757369995, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0435, + "step": 17060 + }, + { + "epoch": 0.4880629020729092, + "grad_norm": 0.9653190970420837, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0393, + "step": 17070 + }, + { + "epoch": 0.48834882058613294, + "grad_norm": 0.785724937915802, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0372, + "step": 17080 + }, + { + "epoch": 0.4886347390993567, + "grad_norm": 0.9450638890266418, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0406, + "step": 17090 + }, + { + "epoch": 0.4889206576125804, + "grad_norm": 0.645124077796936, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0361, + "step": 17100 + }, + { + "epoch": 0.48920657612580415, + "grad_norm": 0.3352372944355011, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0417, + "step": 17110 + }, + { + "epoch": 0.4894924946390279, + "grad_norm": 0.3858814835548401, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0345, + "step": 17120 + }, + { + "epoch": 0.4897784131522516, + "grad_norm": 0.5403604507446289, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0326, + "step": 17130 + }, + { + "epoch": 0.49006433166547536, + "grad_norm": 0.6986777782440186, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0417, + "step": 17140 + }, + { + "epoch": 0.49035025017869904, + "grad_norm": 0.5456675887107849, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0473, + "step": 17150 + }, + { + "epoch": 0.4906361686919228, + "grad_norm": 0.3961554765701294, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0341, + "step": 17160 + }, + { + "epoch": 0.4909220872051465, + "grad_norm": 0.5188277363777161, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0369, + "step": 17170 + }, + { + "epoch": 0.49120800571837026, + "grad_norm": 0.6042230725288391, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0352, + "step": 17180 + }, + { + "epoch": 0.491493924231594, + "grad_norm": 0.5485941171646118, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0405, + "step": 17190 + }, + { + "epoch": 0.49177984274481773, + "grad_norm": 0.5856509804725647, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0402, + "step": 17200 + }, + { + "epoch": 0.49206576125804147, + "grad_norm": 0.8656556010246277, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0349, + "step": 17210 + }, + { + "epoch": 0.4923516797712652, + "grad_norm": 0.4041757583618164, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0364, + "step": 17220 + }, + { + "epoch": 0.49263759828448894, + "grad_norm": 0.6135975122451782, + "learning_rate": 9.324104146177972e-06, + "loss": 0.036, + "step": 17230 + }, + { + "epoch": 0.4929235167977126, + "grad_norm": 0.5101860165596008, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0359, + "step": 17240 + }, + { + "epoch": 0.49320943531093636, + "grad_norm": 0.9913426041603088, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0552, + "step": 17250 + }, + { + "epoch": 0.4934953538241601, + "grad_norm": 0.6148158311843872, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0388, + "step": 17260 + }, + { + "epoch": 0.49378127233738384, + "grad_norm": 0.6651721596717834, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0374, + "step": 17270 + }, + { + "epoch": 0.4940671908506076, + "grad_norm": 0.9545061588287354, + "learning_rate": 9.276232738281744e-06, + "loss": 0.035, + "step": 17280 + }, + { + "epoch": 0.4943531093638313, + "grad_norm": 0.8923225402832031, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0366, + "step": 17290 + }, + { + "epoch": 0.49463902787705505, + "grad_norm": 0.5337848663330078, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0354, + "step": 17300 + }, + { + "epoch": 0.4949249463902788, + "grad_norm": 0.35039281845092773, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0341, + "step": 17310 + }, + { + "epoch": 0.4952108649035025, + "grad_norm": 0.47406911849975586, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0393, + "step": 17320 + }, + { + "epoch": 0.4954967834167262, + "grad_norm": 0.6226631999015808, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0375, + "step": 17330 + }, + { + "epoch": 0.49578270192994994, + "grad_norm": 0.6652712821960449, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0363, + "step": 17340 + }, + { + "epoch": 0.4960686204431737, + "grad_norm": 1.0042835474014282, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0368, + "step": 17350 + }, + { + "epoch": 0.4963545389563974, + "grad_norm": 0.4334045648574829, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0375, + "step": 17360 + }, + { + "epoch": 0.49664045746962115, + "grad_norm": 0.3561633229255676, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0347, + "step": 17370 + }, + { + "epoch": 0.4969263759828449, + "grad_norm": 0.5763550996780396, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0344, + "step": 17380 + }, + { + "epoch": 0.49721229449606863, + "grad_norm": 0.6306643486022949, + "learning_rate": 9.171095634265995e-06, + "loss": 0.037, + "step": 17390 + }, + { + "epoch": 0.49749821300929237, + "grad_norm": 0.4286569058895111, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0308, + "step": 17400 + }, + { + "epoch": 0.4977841315225161, + "grad_norm": 0.577983558177948, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0393, + "step": 17410 + }, + { + "epoch": 0.4980700500357398, + "grad_norm": 0.5714932084083557, + "learning_rate": 9.142466323573853e-06, + "loss": 0.038, + "step": 17420 + }, + { + "epoch": 0.4983559685489635, + "grad_norm": 0.7529498338699341, + "learning_rate": 9.132927564918328e-06, + "loss": 0.033, + "step": 17430 + }, + { + "epoch": 0.49864188706218726, + "grad_norm": 0.5179672241210938, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0367, + "step": 17440 + }, + { + "epoch": 0.498927805575411, + "grad_norm": 0.38424569368362427, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0401, + "step": 17450 + }, + { + "epoch": 0.49921372408863474, + "grad_norm": 0.469460129737854, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0379, + "step": 17460 + }, + { + "epoch": 0.4994996426018585, + "grad_norm": 0.3285387456417084, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0399, + "step": 17470 + }, + { + "epoch": 0.4997855611150822, + "grad_norm": 0.49863550066947937, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0313, + "step": 17480 + }, + { + "epoch": 0.5000714796283059, + "grad_norm": 0.3926186263561249, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0454, + "step": 17490 + }, + { + "epoch": 0.5003573981415297, + "grad_norm": 0.4476146399974823, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0472, + "step": 17500 + }, + { + "epoch": 0.5006433166547534, + "grad_norm": 0.5645599961280823, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0358, + "step": 17510 + }, + { + "epoch": 0.5009292351679772, + "grad_norm": 0.4813307225704193, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0364, + "step": 17520 + }, + { + "epoch": 0.5012151536812008, + "grad_norm": 0.49410971999168396, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0303, + "step": 17530 + }, + { + "epoch": 0.5015010721944246, + "grad_norm": 0.7172105312347412, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0404, + "step": 17540 + }, + { + "epoch": 0.5017869907076483, + "grad_norm": 0.43401873111724854, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0402, + "step": 17550 + }, + { + "epoch": 0.502072909220872, + "grad_norm": 0.6497406363487244, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0364, + "step": 17560 + }, + { + "epoch": 0.5023588277340958, + "grad_norm": 0.44618356227874756, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0337, + "step": 17570 + }, + { + "epoch": 0.5026447462473195, + "grad_norm": 0.4186992049217224, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0381, + "step": 17580 + }, + { + "epoch": 0.5029306647605433, + "grad_norm": 0.7387974858283997, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0319, + "step": 17590 + }, + { + "epoch": 0.503216583273767, + "grad_norm": 0.8068642020225525, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0373, + "step": 17600 + }, + { + "epoch": 0.5035025017869907, + "grad_norm": 0.5773473978042603, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0372, + "step": 17610 + }, + { + "epoch": 0.5037884203002144, + "grad_norm": 0.32488778233528137, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0334, + "step": 17620 + }, + { + "epoch": 0.5040743388134382, + "grad_norm": 0.33978500962257385, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0493, + "step": 17630 + }, + { + "epoch": 0.5043602573266619, + "grad_norm": 0.5897071361541748, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0335, + "step": 17640 + }, + { + "epoch": 0.5046461758398856, + "grad_norm": 0.6275895833969116, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0395, + "step": 17650 + }, + { + "epoch": 0.5049320943531094, + "grad_norm": 0.7995536923408508, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0422, + "step": 17660 + }, + { + "epoch": 0.505218012866333, + "grad_norm": 0.8734716773033142, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0414, + "step": 17670 + }, + { + "epoch": 0.5055039313795568, + "grad_norm": 0.6239343881607056, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0333, + "step": 17680 + }, + { + "epoch": 0.5057898498927805, + "grad_norm": 0.42508623003959656, + "learning_rate": 8.885721609997551e-06, + "loss": 0.045, + "step": 17690 + }, + { + "epoch": 0.5060757684060043, + "grad_norm": 0.4272485673427582, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0506, + "step": 17700 + }, + { + "epoch": 0.506361686919228, + "grad_norm": 0.8006368279457092, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0431, + "step": 17710 + }, + { + "epoch": 0.5066476054324518, + "grad_norm": 0.5896835327148438, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0322, + "step": 17720 + }, + { + "epoch": 0.5069335239456755, + "grad_norm": 0.6880389451980591, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0322, + "step": 17730 + }, + { + "epoch": 0.5072194424588992, + "grad_norm": 1.4850202798843384, + "learning_rate": 8.83836825410936e-06, + "loss": 0.052, + "step": 17740 + }, + { + "epoch": 0.507505360972123, + "grad_norm": 0.7684240937232971, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0353, + "step": 17750 + }, + { + "epoch": 0.5077912794853466, + "grad_norm": 0.5456307530403137, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0419, + "step": 17760 + }, + { + "epoch": 0.5080771979985704, + "grad_norm": 0.5775120258331299, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0366, + "step": 17770 + }, + { + "epoch": 0.5083631165117941, + "grad_norm": 0.6453070044517517, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0341, + "step": 17780 + }, + { + "epoch": 0.5086490350250179, + "grad_norm": 0.7906973361968994, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0405, + "step": 17790 + }, + { + "epoch": 0.5089349535382416, + "grad_norm": 1.0740606784820557, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0344, + "step": 17800 + }, + { + "epoch": 0.5092208720514654, + "grad_norm": 0.41854357719421387, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0334, + "step": 17810 + }, + { + "epoch": 0.5095067905646891, + "grad_norm": 0.6328964233398438, + "learning_rate": 8.762735374981932e-06, + "loss": 0.036, + "step": 17820 + }, + { + "epoch": 0.5097927090779127, + "grad_norm": 0.40875789523124695, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0338, + "step": 17830 + }, + { + "epoch": 0.5100786275911365, + "grad_norm": 0.5056312084197998, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0332, + "step": 17840 + }, + { + "epoch": 0.5103645461043602, + "grad_norm": 0.5005037784576416, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0416, + "step": 17850 + }, + { + "epoch": 0.510650464617584, + "grad_norm": 0.5689167380332947, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0329, + "step": 17860 + }, + { + "epoch": 0.5109363831308077, + "grad_norm": 0.5222717523574829, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0336, + "step": 17870 + }, + { + "epoch": 0.5112223016440315, + "grad_norm": 0.5998329520225525, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0354, + "step": 17880 + }, + { + "epoch": 0.5115082201572552, + "grad_norm": 0.4684480130672455, + "learning_rate": 8.69669425266315e-06, + "loss": 0.05, + "step": 17890 + }, + { + "epoch": 0.511794138670479, + "grad_norm": 0.4061124622821808, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0384, + "step": 17900 + }, + { + "epoch": 0.5120800571837026, + "grad_norm": 0.5025928020477295, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0386, + "step": 17910 + }, + { + "epoch": 0.5123659756969263, + "grad_norm": 0.3731222152709961, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0378, + "step": 17920 + }, + { + "epoch": 0.5126518942101501, + "grad_norm": 0.7784973978996277, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0419, + "step": 17930 + }, + { + "epoch": 0.5129378127233738, + "grad_norm": 0.7074074745178223, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0386, + "step": 17940 + }, + { + "epoch": 0.5132237312365976, + "grad_norm": 0.49802306294441223, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0418, + "step": 17950 + }, + { + "epoch": 0.5135096497498213, + "grad_norm": 0.4355427920818329, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0431, + "step": 17960 + }, + { + "epoch": 0.5137955682630451, + "grad_norm": 0.672635555267334, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0403, + "step": 17970 + }, + { + "epoch": 0.5140814867762687, + "grad_norm": 0.6733908653259277, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0487, + "step": 17980 + }, + { + "epoch": 0.5143674052894925, + "grad_norm": 0.43711504340171814, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0378, + "step": 17990 + }, + { + "epoch": 0.5146533238027162, + "grad_norm": 0.6371222138404846, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0336, + "step": 18000 + }, + { + "epoch": 0.5149392423159399, + "grad_norm": 0.8007041811943054, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0371, + "step": 18010 + }, + { + "epoch": 0.5152251608291637, + "grad_norm": 0.4725078344345093, + "learning_rate": 8.574400723012433e-06, + "loss": 0.037, + "step": 18020 + }, + { + "epoch": 0.5155110793423874, + "grad_norm": 0.34229791164398193, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0353, + "step": 18030 + }, + { + "epoch": 0.5157969978556112, + "grad_norm": 0.27863454818725586, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0371, + "step": 18040 + }, + { + "epoch": 0.5160829163688349, + "grad_norm": 0.43021920323371887, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0419, + "step": 18050 + }, + { + "epoch": 0.5163688348820586, + "grad_norm": 0.4683758318424225, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0307, + "step": 18060 + }, + { + "epoch": 0.5166547533952823, + "grad_norm": 0.29085367918014526, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0372, + "step": 18070 + }, + { + "epoch": 0.5169406719085061, + "grad_norm": 0.4396727681159973, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0328, + "step": 18080 + }, + { + "epoch": 0.5172265904217298, + "grad_norm": 0.539021372795105, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0317, + "step": 18090 + }, + { + "epoch": 0.5175125089349535, + "grad_norm": 0.556974470615387, + "learning_rate": 8.499380733111628e-06, + "loss": 0.037, + "step": 18100 + }, + { + "epoch": 0.5177984274481773, + "grad_norm": 0.4445747137069702, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0344, + "step": 18110 + }, + { + "epoch": 0.518084345961401, + "grad_norm": 0.3742713928222656, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0339, + "step": 18120 + }, + { + "epoch": 0.5183702644746248, + "grad_norm": 0.8467416167259216, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0409, + "step": 18130 + }, + { + "epoch": 0.5186561829878484, + "grad_norm": 0.7731484770774841, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0379, + "step": 18140 + }, + { + "epoch": 0.5189421015010722, + "grad_norm": 0.5664084553718567, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0353, + "step": 18150 + }, + { + "epoch": 0.5192280200142959, + "grad_norm": 0.5623966455459595, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0412, + "step": 18160 + }, + { + "epoch": 0.5195139385275197, + "grad_norm": 0.5074556469917297, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0402, + "step": 18170 + }, + { + "epoch": 0.5197998570407434, + "grad_norm": 0.49439728260040283, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0341, + "step": 18180 + }, + { + "epoch": 0.5200857755539671, + "grad_norm": 0.5982527136802673, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0308, + "step": 18190 + }, + { + "epoch": 0.5203716940671909, + "grad_norm": 0.7891598343849182, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0437, + "step": 18200 + }, + { + "epoch": 0.5206576125804145, + "grad_norm": 0.7565666437149048, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0381, + "step": 18210 + }, + { + "epoch": 0.5209435310936383, + "grad_norm": 0.33346351981163025, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0454, + "step": 18220 + }, + { + "epoch": 0.521229449606862, + "grad_norm": 0.5885659456253052, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0413, + "step": 18230 + }, + { + "epoch": 0.5215153681200858, + "grad_norm": 0.6487091183662415, + "learning_rate": 8.368551060444755e-06, + "loss": 0.035, + "step": 18240 + }, + { + "epoch": 0.5218012866333095, + "grad_norm": 0.9817430377006531, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0394, + "step": 18250 + }, + { + "epoch": 0.5220872051465333, + "grad_norm": 0.5691193342208862, + "learning_rate": 8.349909816537207e-06, + "loss": 0.041, + "step": 18260 + }, + { + "epoch": 0.522373123659757, + "grad_norm": 0.5326661467552185, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0361, + "step": 18270 + }, + { + "epoch": 0.5226590421729806, + "grad_norm": 0.5536142587661743, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0406, + "step": 18280 + }, + { + "epoch": 0.5229449606862044, + "grad_norm": 0.3482394218444824, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0423, + "step": 18290 + }, + { + "epoch": 0.5232308791994281, + "grad_norm": 0.514914333820343, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0352, + "step": 18300 + }, + { + "epoch": 0.5235167977126519, + "grad_norm": 0.7681404948234558, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0386, + "step": 18310 + }, + { + "epoch": 0.5238027162258756, + "grad_norm": 0.400426983833313, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0333, + "step": 18320 + }, + { + "epoch": 0.5240886347390994, + "grad_norm": 0.4996081590652466, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0381, + "step": 18330 + }, + { + "epoch": 0.5243745532523231, + "grad_norm": 0.5379085540771484, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0348, + "step": 18340 + }, + { + "epoch": 0.5246604717655469, + "grad_norm": 0.4462053179740906, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0307, + "step": 18350 + }, + { + "epoch": 0.5249463902787705, + "grad_norm": 0.7336096167564392, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0345, + "step": 18360 + }, + { + "epoch": 0.5252323087919942, + "grad_norm": 0.6676360368728638, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0346, + "step": 18370 + }, + { + "epoch": 0.525518227305218, + "grad_norm": 0.46608656644821167, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0334, + "step": 18380 + }, + { + "epoch": 0.5258041458184417, + "grad_norm": 0.4906940460205078, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0331, + "step": 18390 + }, + { + "epoch": 0.5260900643316655, + "grad_norm": 0.4200032353401184, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0394, + "step": 18400 + }, + { + "epoch": 0.5263759828448892, + "grad_norm": 0.5663877725601196, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0349, + "step": 18410 + }, + { + "epoch": 0.526661901358113, + "grad_norm": 0.36824384331703186, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0303, + "step": 18420 + }, + { + "epoch": 0.5269478198713367, + "grad_norm": 0.8120076060295105, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0443, + "step": 18430 + }, + { + "epoch": 0.5272337383845604, + "grad_norm": 0.4102472960948944, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0369, + "step": 18440 + }, + { + "epoch": 0.5275196568977841, + "grad_norm": 0.5186526775360107, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0338, + "step": 18450 + }, + { + "epoch": 0.5278055754110078, + "grad_norm": 0.9650108218193054, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0343, + "step": 18460 + }, + { + "epoch": 0.5280914939242316, + "grad_norm": 0.5894375443458557, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0416, + "step": 18470 + }, + { + "epoch": 0.5283774124374553, + "grad_norm": 0.6188816428184509, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0402, + "step": 18480 + }, + { + "epoch": 0.5286633309506791, + "grad_norm": 0.35280847549438477, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0363, + "step": 18490 + }, + { + "epoch": 0.5289492494639028, + "grad_norm": 0.7289313673973083, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0392, + "step": 18500 + }, + { + "epoch": 0.5292351679771266, + "grad_norm": 0.505050778388977, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0329, + "step": 18510 + }, + { + "epoch": 0.5295210864903502, + "grad_norm": 0.7029705047607422, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0344, + "step": 18520 + }, + { + "epoch": 0.529807005003574, + "grad_norm": 0.2958471477031708, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0431, + "step": 18530 + }, + { + "epoch": 0.5300929235167977, + "grad_norm": 0.9649683237075806, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0329, + "step": 18540 + }, + { + "epoch": 0.5303788420300214, + "grad_norm": 0.24733735620975494, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0354, + "step": 18550 + }, + { + "epoch": 0.5306647605432452, + "grad_norm": 0.44838136434555054, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0334, + "step": 18560 + }, + { + "epoch": 0.5309506790564689, + "grad_norm": 0.4505597949028015, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0338, + "step": 18570 + }, + { + "epoch": 0.5312365975696927, + "grad_norm": 0.44188442826271057, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0326, + "step": 18580 + }, + { + "epoch": 0.5315225160829163, + "grad_norm": 0.4539152979850769, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0369, + "step": 18590 + }, + { + "epoch": 0.5318084345961401, + "grad_norm": 0.8311023712158203, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0441, + "step": 18600 + }, + { + "epoch": 0.5320943531093638, + "grad_norm": 0.53764808177948, + "learning_rate": 8.025779439806006e-06, + "loss": 0.037, + "step": 18610 + }, + { + "epoch": 0.5323802716225876, + "grad_norm": 1.2192102670669556, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0369, + "step": 18620 + }, + { + "epoch": 0.5326661901358113, + "grad_norm": 0.5254611968994141, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0371, + "step": 18630 + }, + { + "epoch": 0.532952108649035, + "grad_norm": 0.585709810256958, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0337, + "step": 18640 + }, + { + "epoch": 0.5332380271622588, + "grad_norm": 0.45416259765625, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0425, + "step": 18650 + }, + { + "epoch": 0.5335239456754824, + "grad_norm": 0.3957739472389221, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0354, + "step": 18660 + }, + { + "epoch": 0.5338098641887062, + "grad_norm": 0.6211117506027222, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0347, + "step": 18670 + }, + { + "epoch": 0.5340957827019299, + "grad_norm": 0.49023327231407166, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0308, + "step": 18680 + }, + { + "epoch": 0.5343817012151537, + "grad_norm": 0.5823351144790649, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0351, + "step": 18690 + }, + { + "epoch": 0.5346676197283774, + "grad_norm": 0.6048677563667297, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0382, + "step": 18700 + }, + { + "epoch": 0.5349535382416012, + "grad_norm": 0.5293828845024109, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0329, + "step": 18710 + }, + { + "epoch": 0.5352394567548249, + "grad_norm": 0.5935509204864502, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0388, + "step": 18720 + }, + { + "epoch": 0.5355253752680486, + "grad_norm": 0.8369598388671875, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0421, + "step": 18730 + }, + { + "epoch": 0.5358112937812723, + "grad_norm": 0.6874870657920837, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0385, + "step": 18740 + }, + { + "epoch": 0.536097212294496, + "grad_norm": 0.43511492013931274, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0405, + "step": 18750 + }, + { + "epoch": 0.5363831308077198, + "grad_norm": 0.662755012512207, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0375, + "step": 18760 + }, + { + "epoch": 0.5366690493209435, + "grad_norm": 0.5519852638244629, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0351, + "step": 18770 + }, + { + "epoch": 0.5369549678341673, + "grad_norm": 0.9711637496948242, + "learning_rate": 7.869858673101027e-06, + "loss": 0.038, + "step": 18780 + }, + { + "epoch": 0.537240886347391, + "grad_norm": 0.4944411516189575, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0416, + "step": 18790 + }, + { + "epoch": 0.5375268048606148, + "grad_norm": 0.5257377624511719, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0349, + "step": 18800 + }, + { + "epoch": 0.5378127233738385, + "grad_norm": 0.4833063781261444, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0414, + "step": 18810 + }, + { + "epoch": 0.5380986418870621, + "grad_norm": 0.4496164917945862, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0369, + "step": 18820 + }, + { + "epoch": 0.5383845604002859, + "grad_norm": 0.6939138174057007, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0337, + "step": 18830 + }, + { + "epoch": 0.5386704789135096, + "grad_norm": 0.32579538226127625, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0371, + "step": 18840 + }, + { + "epoch": 0.5389563974267334, + "grad_norm": 0.35594654083251953, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0366, + "step": 18850 + }, + { + "epoch": 0.5392423159399571, + "grad_norm": 0.6114012002944946, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0372, + "step": 18860 + }, + { + "epoch": 0.5395282344531809, + "grad_norm": 0.8492457270622253, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0346, + "step": 18870 + }, + { + "epoch": 0.5398141529664046, + "grad_norm": 0.5214036703109741, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0388, + "step": 18880 + }, + { + "epoch": 0.5401000714796284, + "grad_norm": 0.428671658039093, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0394, + "step": 18890 + }, + { + "epoch": 0.540385989992852, + "grad_norm": 0.6071562767028809, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0371, + "step": 18900 + }, + { + "epoch": 0.5406719085060757, + "grad_norm": 0.41996505856513977, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0334, + "step": 18910 + }, + { + "epoch": 0.5409578270192995, + "grad_norm": 0.5260844826698303, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0362, + "step": 18920 + }, + { + "epoch": 0.5412437455325232, + "grad_norm": 0.43362122774124146, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0325, + "step": 18930 + }, + { + "epoch": 0.541529664045747, + "grad_norm": 0.4597149193286896, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0345, + "step": 18940 + }, + { + "epoch": 0.5418155825589707, + "grad_norm": 0.6667322516441345, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0421, + "step": 18950 + }, + { + "epoch": 0.5421015010721945, + "grad_norm": 0.8998900651931763, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0368, + "step": 18960 + }, + { + "epoch": 0.5423874195854181, + "grad_norm": 0.5075538158416748, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0327, + "step": 18970 + }, + { + "epoch": 0.5426733380986419, + "grad_norm": 0.38445526361465454, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0446, + "step": 18980 + }, + { + "epoch": 0.5429592566118656, + "grad_norm": 0.696186363697052, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0364, + "step": 18990 + }, + { + "epoch": 0.5432451751250893, + "grad_norm": 0.6371187567710876, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0401, + "step": 19000 + }, + { + "epoch": 0.5435310936383131, + "grad_norm": 0.6122881174087524, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0345, + "step": 19010 + }, + { + "epoch": 0.5438170121515368, + "grad_norm": 0.4222267270088196, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0456, + "step": 19020 + }, + { + "epoch": 0.5441029306647606, + "grad_norm": 0.6122517585754395, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0434, + "step": 19030 + }, + { + "epoch": 0.5443888491779842, + "grad_norm": 0.2783992886543274, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0354, + "step": 19040 + }, + { + "epoch": 0.544674767691208, + "grad_norm": 0.6433000564575195, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0321, + "step": 19050 + }, + { + "epoch": 0.5449606862044317, + "grad_norm": 0.6967030167579651, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0394, + "step": 19060 + }, + { + "epoch": 0.5452466047176555, + "grad_norm": 0.4799044132232666, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0329, + "step": 19070 + }, + { + "epoch": 0.5455325232308792, + "grad_norm": 0.633895993232727, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0316, + "step": 19080 + }, + { + "epoch": 0.5458184417441029, + "grad_norm": 0.5601945519447327, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0449, + "step": 19090 + }, + { + "epoch": 0.5461043602573267, + "grad_norm": 0.4917007088661194, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0351, + "step": 19100 + }, + { + "epoch": 0.5463902787705504, + "grad_norm": 0.4813363254070282, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.029, + "step": 19110 + }, + { + "epoch": 0.5466761972837741, + "grad_norm": 0.5359676480293274, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0346, + "step": 19120 + }, + { + "epoch": 0.5469621157969978, + "grad_norm": 0.6500958204269409, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0374, + "step": 19130 + }, + { + "epoch": 0.5472480343102216, + "grad_norm": 0.7708510756492615, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0332, + "step": 19140 + }, + { + "epoch": 0.5475339528234453, + "grad_norm": 0.45693230628967285, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0344, + "step": 19150 + }, + { + "epoch": 0.5478198713366691, + "grad_norm": 0.6046226620674133, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0342, + "step": 19160 + }, + { + "epoch": 0.5481057898498928, + "grad_norm": 0.5253175497055054, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0449, + "step": 19170 + }, + { + "epoch": 0.5483917083631165, + "grad_norm": 0.3790060877799988, + "learning_rate": 7.507267205473318e-06, + "loss": 0.037, + "step": 19180 + }, + { + "epoch": 0.5486776268763403, + "grad_norm": 0.37709203362464905, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0346, + "step": 19190 + }, + { + "epoch": 0.5489635453895639, + "grad_norm": 0.3940931558609009, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0427, + "step": 19200 + }, + { + "epoch": 0.5492494639027877, + "grad_norm": 0.761299192905426, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0353, + "step": 19210 + }, + { + "epoch": 0.5495353824160114, + "grad_norm": 0.5268495082855225, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0328, + "step": 19220 + }, + { + "epoch": 0.5498213009292352, + "grad_norm": 0.45624151825904846, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0353, + "step": 19230 + }, + { + "epoch": 0.5501072194424589, + "grad_norm": 0.5374972224235535, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0345, + "step": 19240 + }, + { + "epoch": 0.5503931379556827, + "grad_norm": 0.49830907583236694, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0328, + "step": 19250 + }, + { + "epoch": 0.5506790564689064, + "grad_norm": 0.6223296523094177, + "learning_rate": 7.435514206212475e-06, + "loss": 0.037, + "step": 19260 + }, + { + "epoch": 0.55096497498213, + "grad_norm": 0.42801398038864136, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0371, + "step": 19270 + }, + { + "epoch": 0.5512508934953538, + "grad_norm": 0.3872825801372528, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0314, + "step": 19280 + }, + { + "epoch": 0.5515368120085775, + "grad_norm": 0.3967494070529938, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0349, + "step": 19290 + }, + { + "epoch": 0.5518227305218013, + "grad_norm": 0.42383769154548645, + "learning_rate": 7.399737764864619e-06, + "loss": 0.045, + "step": 19300 + }, + { + "epoch": 0.552108649035025, + "grad_norm": 0.48501884937286377, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0373, + "step": 19310 + }, + { + "epoch": 0.5523945675482488, + "grad_norm": 0.3783693015575409, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0334, + "step": 19320 + }, + { + "epoch": 0.5526804860614725, + "grad_norm": 0.5733019709587097, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0369, + "step": 19330 + }, + { + "epoch": 0.5529664045746963, + "grad_norm": 0.5022825002670288, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0375, + "step": 19340 + }, + { + "epoch": 0.5532523230879199, + "grad_norm": 0.5508015155792236, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0415, + "step": 19350 + }, + { + "epoch": 0.5535382416011436, + "grad_norm": 0.5692425966262817, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0401, + "step": 19360 + }, + { + "epoch": 0.5538241601143674, + "grad_norm": 0.7247840762138367, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0373, + "step": 19370 + }, + { + "epoch": 0.5541100786275911, + "grad_norm": 0.633986234664917, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0435, + "step": 19380 + }, + { + "epoch": 0.5543959971408149, + "grad_norm": 0.8598711490631104, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0424, + "step": 19390 + }, + { + "epoch": 0.5546819156540386, + "grad_norm": 0.782328188419342, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0424, + "step": 19400 + }, + { + "epoch": 0.5549678341672624, + "grad_norm": 0.48890456557273865, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0351, + "step": 19410 + }, + { + "epoch": 0.555253752680486, + "grad_norm": 0.4759981036186218, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0395, + "step": 19420 + }, + { + "epoch": 0.5555396711937098, + "grad_norm": 0.6431323885917664, + "learning_rate": 7.283934675167239e-06, + "loss": 0.036, + "step": 19430 + }, + { + "epoch": 0.5558255897069335, + "grad_norm": 0.6633809208869934, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0333, + "step": 19440 + }, + { + "epoch": 0.5561115082201572, + "grad_norm": 0.3405994772911072, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0375, + "step": 19450 + }, + { + "epoch": 0.556397426733381, + "grad_norm": 0.3443987965583801, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0329, + "step": 19460 + }, + { + "epoch": 0.5566833452466047, + "grad_norm": 0.7973398566246033, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0412, + "step": 19470 + }, + { + "epoch": 0.5569692637598285, + "grad_norm": 0.43843239545822144, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0302, + "step": 19480 + }, + { + "epoch": 0.5572551822730522, + "grad_norm": 0.6797782182693481, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0401, + "step": 19490 + }, + { + "epoch": 0.557541100786276, + "grad_norm": 0.5020610690116882, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0329, + "step": 19500 + }, + { + "epoch": 0.5578270192994996, + "grad_norm": 0.5093050003051758, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0381, + "step": 19510 + }, + { + "epoch": 0.5581129378127234, + "grad_norm": 0.6136947870254517, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0373, + "step": 19520 + }, + { + "epoch": 0.5583988563259471, + "grad_norm": 0.4213317930698395, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0338, + "step": 19530 + }, + { + "epoch": 0.5586847748391708, + "grad_norm": 0.6560636162757874, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0332, + "step": 19540 + }, + { + "epoch": 0.5589706933523946, + "grad_norm": 0.41303765773773193, + "learning_rate": 7.177693135871202e-06, + "loss": 0.03, + "step": 19550 + }, + { + "epoch": 0.5592566118656183, + "grad_norm": 0.5260538458824158, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0328, + "step": 19560 + }, + { + "epoch": 0.559542530378842, + "grad_norm": 0.6076327562332153, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0428, + "step": 19570 + }, + { + "epoch": 0.5598284488920657, + "grad_norm": 0.635111927986145, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0318, + "step": 19580 + }, + { + "epoch": 0.5601143674052895, + "grad_norm": 0.7933056354522705, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0357, + "step": 19590 + }, + { + "epoch": 0.5604002859185132, + "grad_norm": 0.44312241673469543, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0289, + "step": 19600 + }, + { + "epoch": 0.560686204431737, + "grad_norm": 0.36346134543418884, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0354, + "step": 19610 + }, + { + "epoch": 0.5609721229449607, + "grad_norm": 0.49605289101600647, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0367, + "step": 19620 + }, + { + "epoch": 0.5612580414581844, + "grad_norm": 0.7115452289581299, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0354, + "step": 19630 + }, + { + "epoch": 0.5615439599714082, + "grad_norm": 0.650925874710083, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0353, + "step": 19640 + }, + { + "epoch": 0.5618298784846318, + "grad_norm": 0.5046663880348206, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0294, + "step": 19650 + }, + { + "epoch": 0.5621157969978556, + "grad_norm": 0.4441855549812317, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0326, + "step": 19660 + }, + { + "epoch": 0.5624017155110793, + "grad_norm": 0.3956650495529175, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0446, + "step": 19670 + }, + { + "epoch": 0.5626876340243031, + "grad_norm": 0.5384211540222168, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0331, + "step": 19680 + }, + { + "epoch": 0.5629735525375268, + "grad_norm": 0.6183366775512695, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0324, + "step": 19690 + }, + { + "epoch": 0.5632594710507506, + "grad_norm": 0.9116242527961731, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0341, + "step": 19700 + }, + { + "epoch": 0.5635453895639743, + "grad_norm": 0.8171015381813049, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0306, + "step": 19710 + }, + { + "epoch": 0.563831308077198, + "grad_norm": 0.42670243978500366, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0336, + "step": 19720 + }, + { + "epoch": 0.5641172265904217, + "grad_norm": 0.7338811159133911, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0363, + "step": 19730 + }, + { + "epoch": 0.5644031451036454, + "grad_norm": 0.5576338171958923, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0371, + "step": 19740 + }, + { + "epoch": 0.5646890636168692, + "grad_norm": 0.7390629649162292, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0382, + "step": 19750 + }, + { + "epoch": 0.5649749821300929, + "grad_norm": 0.801812469959259, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0379, + "step": 19760 + }, + { + "epoch": 0.5652609006433167, + "grad_norm": 0.5697385668754578, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0369, + "step": 19770 + }, + { + "epoch": 0.5655468191565404, + "grad_norm": 0.4180932343006134, + "learning_rate": 6.975884226362e-06, + "loss": 0.039, + "step": 19780 + }, + { + "epoch": 0.5658327376697642, + "grad_norm": 0.648389995098114, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0346, + "step": 19790 + }, + { + "epoch": 0.5661186561829878, + "grad_norm": 0.9673929214477539, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0392, + "step": 19800 + }, + { + "epoch": 0.5664045746962115, + "grad_norm": 0.4793975353240967, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0321, + "step": 19810 + }, + { + "epoch": 0.5666904932094353, + "grad_norm": 0.5206098556518555, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0319, + "step": 19820 + }, + { + "epoch": 0.566976411722659, + "grad_norm": 0.39929306507110596, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0335, + "step": 19830 + }, + { + "epoch": 0.5672623302358828, + "grad_norm": 0.6819440722465515, + "learning_rate": 6.923644220932124e-06, + "loss": 0.0338, + "step": 19840 + }, + { + "epoch": 0.5675482487491065, + "grad_norm": 0.7612042427062988, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0345, + "step": 19850 + }, + { + "epoch": 0.5678341672623303, + "grad_norm": 0.472676545381546, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0388, + "step": 19860 + }, + { + "epoch": 0.568120085775554, + "grad_norm": 0.48102107644081116, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0304, + "step": 19870 + }, + { + "epoch": 0.5684060042887777, + "grad_norm": 0.4174644649028778, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0315, + "step": 19880 + }, + { + "epoch": 0.5686919228020014, + "grad_norm": 0.4218151271343231, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0413, + "step": 19890 + }, + { + "epoch": 0.5689778413152251, + "grad_norm": 0.8243978023529053, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0399, + "step": 19900 + }, + { + "epoch": 0.5692637598284489, + "grad_norm": 0.400924414396286, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0298, + "step": 19910 + }, + { + "epoch": 0.5695496783416726, + "grad_norm": 0.5199277400970459, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0351, + "step": 19920 + }, + { + "epoch": 0.5698355968548964, + "grad_norm": 0.5238781571388245, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0374, + "step": 19930 + }, + { + "epoch": 0.5701215153681201, + "grad_norm": 0.7451756596565247, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0378, + "step": 19940 + }, + { + "epoch": 0.5704074338813439, + "grad_norm": 0.5029926300048828, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0391, + "step": 19950 + }, + { + "epoch": 0.5706933523945675, + "grad_norm": 0.5532147884368896, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0383, + "step": 19960 + }, + { + "epoch": 0.5709792709077913, + "grad_norm": 0.5694131851196289, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0364, + "step": 19970 + }, + { + "epoch": 0.571265189421015, + "grad_norm": 0.5066515803337097, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0363, + "step": 19980 + }, + { + "epoch": 0.5715511079342387, + "grad_norm": 0.5676470398902893, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0327, + "step": 19990 + }, + { + "epoch": 0.5718370264474625, + "grad_norm": 0.37414318323135376, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0395, + "step": 20000 + }, + { + "epoch": 0.5721229449606862, + "grad_norm": 0.5888793468475342, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0372, + "step": 20010 + }, + { + "epoch": 0.57240886347391, + "grad_norm": 0.6593262553215027, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0329, + "step": 20020 + }, + { + "epoch": 0.5726947819871336, + "grad_norm": 0.6382879614830017, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0286, + "step": 20030 + }, + { + "epoch": 0.5729807005003574, + "grad_norm": 0.6364927887916565, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0383, + "step": 20040 + }, + { + "epoch": 0.5732666190135811, + "grad_norm": 0.4102194011211395, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0342, + "step": 20050 + }, + { + "epoch": 0.5735525375268049, + "grad_norm": 0.6449235081672668, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0315, + "step": 20060 + }, + { + "epoch": 0.5738384560400286, + "grad_norm": 0.708431601524353, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0316, + "step": 20070 + }, + { + "epoch": 0.5741243745532523, + "grad_norm": 0.46444272994995117, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0352, + "step": 20080 + }, + { + "epoch": 0.5744102930664761, + "grad_norm": 0.7026715278625488, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0337, + "step": 20090 + }, + { + "epoch": 0.5746962115796997, + "grad_norm": 0.43397894501686096, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0303, + "step": 20100 + }, + { + "epoch": 0.5749821300929235, + "grad_norm": 0.4937734305858612, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0403, + "step": 20110 + }, + { + "epoch": 0.5752680486061472, + "grad_norm": 0.5981410145759583, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0375, + "step": 20120 + }, + { + "epoch": 0.575553967119371, + "grad_norm": 0.5616198778152466, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0314, + "step": 20130 + }, + { + "epoch": 0.5758398856325947, + "grad_norm": 0.35028502345085144, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0367, + "step": 20140 + }, + { + "epoch": 0.5761258041458185, + "grad_norm": 0.3556109666824341, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0308, + "step": 20150 + }, + { + "epoch": 0.5764117226590422, + "grad_norm": 0.579409658908844, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0344, + "step": 20160 + }, + { + "epoch": 0.5766976411722659, + "grad_norm": 0.4484683573246002, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0312, + "step": 20170 + }, + { + "epoch": 0.5769835596854896, + "grad_norm": 0.3636038899421692, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0337, + "step": 20180 + }, + { + "epoch": 0.5772694781987133, + "grad_norm": 0.6667287349700928, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0343, + "step": 20190 + }, + { + "epoch": 0.5775553967119371, + "grad_norm": 0.26031574606895447, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0303, + "step": 20200 + }, + { + "epoch": 0.5778413152251608, + "grad_norm": 0.6683355569839478, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0316, + "step": 20210 + }, + { + "epoch": 0.5781272337383846, + "grad_norm": 0.4097786843776703, + "learning_rate": 6.596880604028027e-06, + "loss": 0.0346, + "step": 20220 + }, + { + "epoch": 0.5784131522516083, + "grad_norm": 0.45405757427215576, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0345, + "step": 20230 + }, + { + "epoch": 0.5786990707648321, + "grad_norm": 0.28291839361190796, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0323, + "step": 20240 + }, + { + "epoch": 0.5789849892780558, + "grad_norm": 0.5656186938285828, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0299, + "step": 20250 + }, + { + "epoch": 0.5792709077912794, + "grad_norm": 0.6780310869216919, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0309, + "step": 20260 + }, + { + "epoch": 0.5795568263045032, + "grad_norm": 0.3968813121318817, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0347, + "step": 20270 + }, + { + "epoch": 0.5798427448177269, + "grad_norm": 0.6598440408706665, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0329, + "step": 20280 + }, + { + "epoch": 0.5801286633309507, + "grad_norm": 0.4988970458507538, + "learning_rate": 6.53748481975927e-06, + "loss": 0.038, + "step": 20290 + }, + { + "epoch": 0.5804145818441744, + "grad_norm": 0.8016706705093384, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0358, + "step": 20300 + }, + { + "epoch": 0.5807005003573982, + "grad_norm": 0.8367684483528137, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0354, + "step": 20310 + }, + { + "epoch": 0.5809864188706219, + "grad_norm": 0.5730129480361938, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0421, + "step": 20320 + }, + { + "epoch": 0.5812723373838456, + "grad_norm": 0.43631577491760254, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0361, + "step": 20330 + }, + { + "epoch": 0.5815582558970693, + "grad_norm": 0.7001264691352844, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0355, + "step": 20340 + }, + { + "epoch": 0.581844174410293, + "grad_norm": 0.4988951086997986, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0387, + "step": 20350 + }, + { + "epoch": 0.5821300929235168, + "grad_norm": 0.45731016993522644, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0398, + "step": 20360 + }, + { + "epoch": 0.5824160114367405, + "grad_norm": 0.38684406876564026, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0345, + "step": 20370 + }, + { + "epoch": 0.5827019299499643, + "grad_norm": 0.3924580514431, + "learning_rate": 6.461496350649529e-06, + "loss": 0.037, + "step": 20380 + }, + { + "epoch": 0.582987848463188, + "grad_norm": 0.43735265731811523, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0371, + "step": 20390 + }, + { + "epoch": 0.5832737669764118, + "grad_norm": 0.4595138430595398, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0337, + "step": 20400 + }, + { + "epoch": 0.5835596854896354, + "grad_norm": 0.429569810628891, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0284, + "step": 20410 + }, + { + "epoch": 0.5838456040028592, + "grad_norm": 0.5399166345596313, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0353, + "step": 20420 + }, + { + "epoch": 0.5841315225160829, + "grad_norm": 0.5698734521865845, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0361, + "step": 20430 + }, + { + "epoch": 0.5844174410293066, + "grad_norm": 0.35422587394714355, + "learning_rate": 6.411076603575166e-06, + "loss": 0.033, + "step": 20440 + }, + { + "epoch": 0.5847033595425304, + "grad_norm": 0.4475875198841095, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0344, + "step": 20450 + }, + { + "epoch": 0.5849892780557541, + "grad_norm": 0.4950159192085266, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0428, + "step": 20460 + }, + { + "epoch": 0.5852751965689779, + "grad_norm": 0.695249617099762, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0354, + "step": 20470 + }, + { + "epoch": 0.5855611150822015, + "grad_norm": 0.2538593113422394, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0383, + "step": 20480 + }, + { + "epoch": 0.5858470335954253, + "grad_norm": 0.6770910024642944, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0364, + "step": 20490 + }, + { + "epoch": 0.586132952108649, + "grad_norm": 0.7187057733535767, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0319, + "step": 20500 + }, + { + "epoch": 0.5864188706218728, + "grad_norm": 0.34853193163871765, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.033, + "step": 20510 + }, + { + "epoch": 0.5867047891350965, + "grad_norm": 0.8484768271446228, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0331, + "step": 20520 + }, + { + "epoch": 0.5869907076483202, + "grad_norm": 0.6645244359970093, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0353, + "step": 20530 + }, + { + "epoch": 0.587276626161544, + "grad_norm": 0.5094996690750122, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0374, + "step": 20540 + }, + { + "epoch": 0.5875625446747677, + "grad_norm": 0.5012859106063843, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0329, + "step": 20550 + }, + { + "epoch": 0.5878484631879914, + "grad_norm": 0.6465861797332764, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0282, + "step": 20560 + }, + { + "epoch": 0.5881343817012151, + "grad_norm": 0.5694834589958191, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0313, + "step": 20570 + }, + { + "epoch": 0.5884203002144389, + "grad_norm": 0.4945555627346039, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0353, + "step": 20580 + }, + { + "epoch": 0.5887062187276626, + "grad_norm": 0.5606586933135986, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0343, + "step": 20590 + }, + { + "epoch": 0.5889921372408864, + "grad_norm": 0.6913802027702332, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0358, + "step": 20600 + }, + { + "epoch": 0.5892780557541101, + "grad_norm": 0.8119901418685913, + "learning_rate": 6.269280523549298e-06, + "loss": 0.038, + "step": 20610 + }, + { + "epoch": 0.5895639742673338, + "grad_norm": 0.5558752417564392, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0311, + "step": 20620 + }, + { + "epoch": 0.5898498927805575, + "grad_norm": 0.45028987526893616, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0321, + "step": 20630 + }, + { + "epoch": 0.5901358112937812, + "grad_norm": 0.3697125017642975, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0331, + "step": 20640 + }, + { + "epoch": 0.590421729807005, + "grad_norm": 0.5406038761138916, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0445, + "step": 20650 + }, + { + "epoch": 0.5907076483202287, + "grad_norm": 0.4301048219203949, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0371, + "step": 20660 + }, + { + "epoch": 0.5909935668334525, + "grad_norm": 0.6343403458595276, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0353, + "step": 20670 + }, + { + "epoch": 0.5912794853466762, + "grad_norm": 0.4666310250759125, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0352, + "step": 20680 + }, + { + "epoch": 0.5915654038599, + "grad_norm": 0.7471063733100891, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0352, + "step": 20690 + }, + { + "epoch": 0.5918513223731237, + "grad_norm": 0.9971692562103271, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0327, + "step": 20700 + }, + { + "epoch": 0.5921372408863473, + "grad_norm": 0.5646237134933472, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0365, + "step": 20710 + }, + { + "epoch": 0.5924231593995711, + "grad_norm": 0.46781328320503235, + "learning_rate": 6.17838207381795e-06, + "loss": 0.042, + "step": 20720 + }, + { + "epoch": 0.5927090779127948, + "grad_norm": 0.7061547040939331, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0484, + "step": 20730 + }, + { + "epoch": 0.5929949964260186, + "grad_norm": 0.6651175618171692, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0353, + "step": 20740 + }, + { + "epoch": 0.5932809149392423, + "grad_norm": 0.5959596037864685, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0344, + "step": 20750 + }, + { + "epoch": 0.5935668334524661, + "grad_norm": 0.5869056582450867, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0389, + "step": 20760 + }, + { + "epoch": 0.5938527519656898, + "grad_norm": 0.42101356387138367, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0288, + "step": 20770 + }, + { + "epoch": 0.5941386704789136, + "grad_norm": 0.6310023069381714, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0362, + "step": 20780 + }, + { + "epoch": 0.5944245889921372, + "grad_norm": 0.6737013459205627, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0377, + "step": 20790 + }, + { + "epoch": 0.5947105075053609, + "grad_norm": 0.6716046333312988, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0415, + "step": 20800 + }, + { + "epoch": 0.5949964260185847, + "grad_norm": 0.9742669463157654, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0337, + "step": 20810 + }, + { + "epoch": 0.5952823445318084, + "grad_norm": 0.571782648563385, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0362, + "step": 20820 + }, + { + "epoch": 0.5955682630450322, + "grad_norm": 0.9673911333084106, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0362, + "step": 20830 + }, + { + "epoch": 0.5958541815582559, + "grad_norm": 0.5391695499420166, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0331, + "step": 20840 + }, + { + "epoch": 0.5961401000714797, + "grad_norm": 1.4766349792480469, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0332, + "step": 20850 + }, + { + "epoch": 0.5964260185847033, + "grad_norm": 0.6329004168510437, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0375, + "step": 20860 + }, + { + "epoch": 0.5967119370979271, + "grad_norm": 0.6745501160621643, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0347, + "step": 20870 + }, + { + "epoch": 0.5969978556111508, + "grad_norm": 0.3006536364555359, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0321, + "step": 20880 + }, + { + "epoch": 0.5972837741243745, + "grad_norm": 0.4666125476360321, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0363, + "step": 20890 + }, + { + "epoch": 0.5975696926375983, + "grad_norm": 0.3881456255912781, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0318, + "step": 20900 + }, + { + "epoch": 0.597855611150822, + "grad_norm": 0.4211449921131134, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0357, + "step": 20910 + }, + { + "epoch": 0.5981415296640458, + "grad_norm": 1.125683307647705, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0364, + "step": 20920 + }, + { + "epoch": 0.5984274481772694, + "grad_norm": 0.9670853614807129, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0385, + "step": 20930 + }, + { + "epoch": 0.5987133666904932, + "grad_norm": 0.7302138209342957, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0321, + "step": 20940 + }, + { + "epoch": 0.5989992852037169, + "grad_norm": 0.7883613109588623, + "learning_rate": 5.990549152010853e-06, + "loss": 0.038, + "step": 20950 + }, + { + "epoch": 0.5992852037169407, + "grad_norm": 0.44051188230514526, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0423, + "step": 20960 + }, + { + "epoch": 0.5995711222301644, + "grad_norm": 0.5225116014480591, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0293, + "step": 20970 + }, + { + "epoch": 0.5998570407433881, + "grad_norm": 0.44672495126724243, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0314, + "step": 20980 + }, + { + "epoch": 0.6001429592566119, + "grad_norm": 0.4489240050315857, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0298, + "step": 20990 + }, + { + "epoch": 0.6004288777698356, + "grad_norm": 0.3942757844924927, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0323, + "step": 21000 + }, + { + "epoch": 0.6007147962830593, + "grad_norm": 0.5079668760299683, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0435, + "step": 21010 + }, + { + "epoch": 0.601000714796283, + "grad_norm": 0.5057359933853149, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0364, + "step": 21020 + }, + { + "epoch": 0.6012866333095068, + "grad_norm": 0.4823545515537262, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0408, + "step": 21030 + }, + { + "epoch": 0.6015725518227305, + "grad_norm": 0.42647498846054077, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0366, + "step": 21040 + }, + { + "epoch": 0.6018584703359543, + "grad_norm": 0.5967830419540405, + "learning_rate": 5.909845843697164e-06, + "loss": 0.037, + "step": 21050 + }, + { + "epoch": 0.602144388849178, + "grad_norm": 0.4567292034626007, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0306, + "step": 21060 + }, + { + "epoch": 0.6024303073624017, + "grad_norm": 0.6767273545265198, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0447, + "step": 21070 + }, + { + "epoch": 0.6027162258756255, + "grad_norm": 0.2957002520561218, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0339, + "step": 21080 + }, + { + "epoch": 0.6030021443888491, + "grad_norm": 0.6870969533920288, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0313, + "step": 21090 + }, + { + "epoch": 0.6032880629020729, + "grad_norm": 0.530910313129425, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0377, + "step": 21100 + }, + { + "epoch": 0.6035739814152966, + "grad_norm": 0.21370625495910645, + "learning_rate": 5.86170998451151e-06, + "loss": 0.032, + "step": 21110 + }, + { + "epoch": 0.6038598999285204, + "grad_norm": 0.6039503812789917, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0258, + "step": 21120 + }, + { + "epoch": 0.6041458184417441, + "grad_norm": 0.5375682711601257, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0341, + "step": 21130 + }, + { + "epoch": 0.6044317369549679, + "grad_norm": 0.4819096326828003, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0309, + "step": 21140 + }, + { + "epoch": 0.6047176554681916, + "grad_norm": 0.31165415048599243, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0278, + "step": 21150 + }, + { + "epoch": 0.6050035739814152, + "grad_norm": 0.2781001925468445, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0342, + "step": 21160 + }, + { + "epoch": 0.605289492494639, + "grad_norm": 0.44726037979125977, + "learning_rate": 5.813791207086085e-06, + "loss": 0.032, + "step": 21170 + }, + { + "epoch": 0.6055754110078627, + "grad_norm": 0.5762766599655151, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0325, + "step": 21180 + }, + { + "epoch": 0.6058613295210865, + "grad_norm": 0.49829939007759094, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0322, + "step": 21190 + }, + { + "epoch": 0.6061472480343102, + "grad_norm": 0.4683297276496887, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0369, + "step": 21200 + }, + { + "epoch": 0.606433166547534, + "grad_norm": 0.662159264087677, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0278, + "step": 21210 + }, + { + "epoch": 0.6067190850607577, + "grad_norm": 0.4397001564502716, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0366, + "step": 21220 + }, + { + "epoch": 0.6070050035739815, + "grad_norm": 0.4977007508277893, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0293, + "step": 21230 + }, + { + "epoch": 0.6072909220872051, + "grad_norm": 0.3705490827560425, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0315, + "step": 21240 + }, + { + "epoch": 0.6075768406004288, + "grad_norm": 0.6350240111351013, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0286, + "step": 21250 + }, + { + "epoch": 0.6078627591136526, + "grad_norm": 0.5590423941612244, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0373, + "step": 21260 + }, + { + "epoch": 0.6081486776268763, + "grad_norm": 0.5244049429893494, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0325, + "step": 21270 + }, + { + "epoch": 0.6084345961401001, + "grad_norm": 1.082044005393982, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0373, + "step": 21280 + }, + { + "epoch": 0.6087205146533238, + "grad_norm": 0.614028811454773, + "learning_rate": 5.71861298612245e-06, + "loss": 0.031, + "step": 21290 + }, + { + "epoch": 0.6090064331665476, + "grad_norm": 0.783205509185791, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0289, + "step": 21300 + }, + { + "epoch": 0.6092923516797712, + "grad_norm": 0.5420807600021362, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.031, + "step": 21310 + }, + { + "epoch": 0.609578270192995, + "grad_norm": 0.42979222536087036, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0291, + "step": 21320 + }, + { + "epoch": 0.6098641887062187, + "grad_norm": 0.44511356949806213, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.031, + "step": 21330 + }, + { + "epoch": 0.6101501072194424, + "grad_norm": 0.528799831867218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0269, + "step": 21340 + }, + { + "epoch": 0.6104360257326662, + "grad_norm": 0.43274471163749695, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0438, + "step": 21350 + }, + { + "epoch": 0.6107219442458899, + "grad_norm": 0.8020172715187073, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0393, + "step": 21360 + }, + { + "epoch": 0.6110078627591137, + "grad_norm": 0.4354296028614044, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0338, + "step": 21370 + }, + { + "epoch": 0.6112937812723374, + "grad_norm": 0.587364673614502, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0359, + "step": 21380 + }, + { + "epoch": 0.6115796997855611, + "grad_norm": 0.5426310300827026, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0333, + "step": 21390 + }, + { + "epoch": 0.6118656182987848, + "grad_norm": 0.5900459289550781, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0344, + "step": 21400 + }, + { + "epoch": 0.6121515368120086, + "grad_norm": 0.5652357935905457, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0396, + "step": 21410 + }, + { + "epoch": 0.6124374553252323, + "grad_norm": 0.5287114977836609, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0387, + "step": 21420 + }, + { + "epoch": 0.612723373838456, + "grad_norm": 0.7939184904098511, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0351, + "step": 21430 + }, + { + "epoch": 0.6130092923516798, + "grad_norm": 0.6840642094612122, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0349, + "step": 21440 + }, + { + "epoch": 0.6132952108649035, + "grad_norm": 0.3717428147792816, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0336, + "step": 21450 + }, + { + "epoch": 0.6135811293781273, + "grad_norm": 0.5073713064193726, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0326, + "step": 21460 + }, + { + "epoch": 0.6138670478913509, + "grad_norm": 1.1579232215881348, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0388, + "step": 21470 + }, + { + "epoch": 0.6141529664045747, + "grad_norm": 0.4209369122982025, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0307, + "step": 21480 + }, + { + "epoch": 0.6144388849177984, + "grad_norm": 0.38663822412490845, + "learning_rate": 5.561973825289734e-06, + "loss": 0.037, + "step": 21490 + }, + { + "epoch": 0.6147248034310222, + "grad_norm": 0.538270890712738, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0333, + "step": 21500 + }, + { + "epoch": 0.6150107219442459, + "grad_norm": 0.28280535340309143, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0319, + "step": 21510 + }, + { + "epoch": 0.6152966404574696, + "grad_norm": 0.5407803058624268, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0352, + "step": 21520 + }, + { + "epoch": 0.6155825589706934, + "grad_norm": 1.4600974321365356, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0409, + "step": 21530 + }, + { + "epoch": 0.615868477483917, + "grad_norm": 0.659900426864624, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0322, + "step": 21540 + }, + { + "epoch": 0.6161543959971408, + "grad_norm": 0.6401934623718262, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0339, + "step": 21550 + }, + { + "epoch": 0.6164403145103645, + "grad_norm": 0.6409866213798523, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0331, + "step": 21560 + }, + { + "epoch": 0.6167262330235883, + "grad_norm": 0.6627630591392517, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0332, + "step": 21570 + }, + { + "epoch": 0.617012151536812, + "grad_norm": 0.6180721521377563, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0327, + "step": 21580 + }, + { + "epoch": 0.6172980700500358, + "grad_norm": 0.4689866006374359, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0276, + "step": 21590 + }, + { + "epoch": 0.6175839885632595, + "grad_norm": 0.5039265751838684, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0427, + "step": 21600 + }, + { + "epoch": 0.6178699070764831, + "grad_norm": 0.5313833355903625, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0351, + "step": 21610 + }, + { + "epoch": 0.6181558255897069, + "grad_norm": 0.4919044077396393, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0327, + "step": 21620 + }, + { + "epoch": 0.6184417441029306, + "grad_norm": 0.5446444153785706, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0331, + "step": 21630 + }, + { + "epoch": 0.6187276626161544, + "grad_norm": 0.5198109745979309, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.032, + "step": 21640 + }, + { + "epoch": 0.6190135811293781, + "grad_norm": 0.5684625506401062, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0339, + "step": 21650 + }, + { + "epoch": 0.6192994996426019, + "grad_norm": 0.6882810592651367, + "learning_rate": 5.430834687545416e-06, + "loss": 0.035, + "step": 21660 + }, + { + "epoch": 0.6195854181558256, + "grad_norm": 0.7360101938247681, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0306, + "step": 21670 + }, + { + "epoch": 0.6198713366690494, + "grad_norm": 0.5557180047035217, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0241, + "step": 21680 + }, + { + "epoch": 0.620157255182273, + "grad_norm": 0.4302096962928772, + "learning_rate": 5.407887295494495e-06, + "loss": 0.035, + "step": 21690 + }, + { + "epoch": 0.6204431736954967, + "grad_norm": 0.4740016460418701, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0331, + "step": 21700 + }, + { + "epoch": 0.6207290922087205, + "grad_norm": 0.5400598049163818, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0297, + "step": 21710 + }, + { + "epoch": 0.6210150107219442, + "grad_norm": 0.4270641803741455, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0334, + "step": 21720 + }, + { + "epoch": 0.621300929235168, + "grad_norm": 0.41063550114631653, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0375, + "step": 21730 + }, + { + "epoch": 0.6215868477483917, + "grad_norm": 0.48556044697761536, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0291, + "step": 21740 + }, + { + "epoch": 0.6218727662616155, + "grad_norm": 0.2872731387615204, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0323, + "step": 21750 + }, + { + "epoch": 0.6221586847748392, + "grad_norm": 0.4088454246520996, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0307, + "step": 21760 + }, + { + "epoch": 0.622444603288063, + "grad_norm": 0.42600440979003906, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0326, + "step": 21770 + }, + { + "epoch": 0.6227305218012866, + "grad_norm": 0.36466315388679504, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0337, + "step": 21780 + }, + { + "epoch": 0.6230164403145103, + "grad_norm": 0.588921308517456, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0336, + "step": 21790 + }, + { + "epoch": 0.6233023588277341, + "grad_norm": 0.44768571853637695, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0326, + "step": 21800 + }, + { + "epoch": 0.6235882773409578, + "grad_norm": 1.1612637042999268, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0355, + "step": 21810 + }, + { + "epoch": 0.6238741958541816, + "grad_norm": 1.0912114381790161, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0436, + "step": 21820 + }, + { + "epoch": 0.6241601143674053, + "grad_norm": 0.5813164710998535, + "learning_rate": 5.301584321328435e-06, + "loss": 0.034, + "step": 21830 + }, + { + "epoch": 0.624446032880629, + "grad_norm": 0.45064911246299744, + "learning_rate": 5.294041118587667e-06, + "loss": 0.032, + "step": 21840 + }, + { + "epoch": 0.6247319513938527, + "grad_norm": 0.5173943638801575, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0322, + "step": 21850 + }, + { + "epoch": 0.6250178699070765, + "grad_norm": 0.41157352924346924, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0319, + "step": 21860 + }, + { + "epoch": 0.6253037884203002, + "grad_norm": 0.5711286067962646, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0288, + "step": 21870 + }, + { + "epoch": 0.6255897069335239, + "grad_norm": 0.5108116865158081, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0325, + "step": 21880 + }, + { + "epoch": 0.6258756254467477, + "grad_norm": 0.49562424421310425, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0306, + "step": 21890 + }, + { + "epoch": 0.6261615439599714, + "grad_norm": 0.3392108976840973, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0358, + "step": 21900 + }, + { + "epoch": 0.6264474624731952, + "grad_norm": 1.0588114261627197, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0404, + "step": 21910 + }, + { + "epoch": 0.6267333809864188, + "grad_norm": 0.6979959607124329, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0349, + "step": 21920 + }, + { + "epoch": 0.6270192994996426, + "grad_norm": 0.3185918927192688, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0276, + "step": 21930 + }, + { + "epoch": 0.6273052180128663, + "grad_norm": 0.3921501338481903, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0326, + "step": 21940 + }, + { + "epoch": 0.6275911365260901, + "grad_norm": 0.9666212797164917, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0346, + "step": 21950 + }, + { + "epoch": 0.6278770550393138, + "grad_norm": 0.4483211040496826, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0306, + "step": 21960 + }, + { + "epoch": 0.6281629735525375, + "grad_norm": 0.4839077293872833, + "learning_rate": 5.196592054173714e-06, + "loss": 0.026, + "step": 21970 + }, + { + "epoch": 0.6284488920657613, + "grad_norm": 0.5054528117179871, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0299, + "step": 21980 + }, + { + "epoch": 0.628734810578985, + "grad_norm": 0.5953076481819153, + "learning_rate": 5.181701567303612e-06, + "loss": 0.036, + "step": 21990 + }, + { + "epoch": 0.6290207290922087, + "grad_norm": 0.39300060272216797, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0358, + "step": 22000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.374205589323776e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/training_args.bin b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a8e9db2fc8c02e02c3d9dc8ab6720ad303a5b3a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612ba70c7690571cb25b3741b149289d0da6675f330268700d4dd75e92ecc19a +size 6097 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/generation_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6e889884afa439b94ced30d3b053d2bd661b6529 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06894f9c14344861990c4c3e6e2d2981a600e2593dbab841ada511532e1fa6e8 +size 4921072616 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cdf7454f21e0ebe7bef11b7793629d8ac104d19c --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c285ed600a59a9fad8b89b5ee35d7e033805c39340fe94efd355d8b9646a71c9 +size 4978830984 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a8d827826639ccf77a3c036ca77e0072ca2da4c4 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5c8bbbdbe57ff418b7a7af401aef2c165b4e6afa1d2188a515ffda7f9e92771 +size 4100977896 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..7a37358d95e92a337ffbc69008e6d3a514583ff2 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -15.553912042236327, + -29.199742523193358, + -19.58108451538086, + -2.290254103851318, + -3.98537020587921, + -3.326780859374999, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 20.256868560791013, + 29.94644501495361, + 21.81786548461914, + 2.931905368041992, + 5.064435471534729, + 3.8213318216323877, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.8829866647720337, + 2.0021812915802, + 0.2094610631465912, + 0.0940750315785408, + 0.0910087525844574, + 0.012966467998921871, + -0.09716881066560745, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.976093769073486, + 10.930583953857422, + 8.330232620239258, + 0.8605863451957703, + 1.5304595232009888, + 1.1747541427612305, + 0.995267927646637, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -14.624815139007566, + -31.510755078125, + -35.281760287475585, + -4.413841687011719, + -8.509904860687255, + -6.548201916885375, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 40.4127169593811, + 31.91034956970215, + 26.84413584289551, + 7.540738459014893, + 10.178268561553956, + 9.913993389892582, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 10.31286334991455, + 3.0421667098999023, + -4.947638511657715, + 0.41632387042045593, + -0.9987452030181885, + -0.18793217837810516, + -0.08814626932144165, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 10.463665962219238, + 14.231209754943848, + 11.03242301940918, + 2.1795010566711426, + 3.3540749549865723, + 2.708117961883545, + 0.9961075186729431, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..74ac81c58834f9f67b33441bfd8e89a01f8c6878 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json @@ -0,0 +1,16834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.686204431736955, + "eval_steps": 500, + "global_step": 24000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002859185132237312, + "grad_norm": 4.32843542098999, + "learning_rate": 1.8e-07, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0005718370264474624, + "grad_norm": 5.184113502502441, + "learning_rate": 3.8e-07, + "loss": 0.6206, + "step": 20 + }, + { + "epoch": 0.0008577555396711937, + "grad_norm": 4.515527248382568, + "learning_rate": 5.800000000000001e-07, + "loss": 0.582, + "step": 30 + }, + { + "epoch": 0.0011436740528949249, + "grad_norm": 2.8382818698883057, + "learning_rate": 7.8e-07, + "loss": 0.544, + "step": 40 + }, + { + "epoch": 0.0014295925661186562, + "grad_norm": 4.019079208374023, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6381, + "step": 50 + }, + { + "epoch": 0.0017155110793423873, + "grad_norm": 2.9916157722473145, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5463, + "step": 60 + }, + { + "epoch": 0.0020014295925661185, + "grad_norm": 3.3288328647613525, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.446, + "step": 70 + }, + { + "epoch": 0.0022873481057898498, + "grad_norm": 3.181410312652588, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4497, + "step": 80 + }, + { + "epoch": 0.002573266619013581, + "grad_norm": 1.421942949295044, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.349, + "step": 90 + }, + { + "epoch": 0.0028591851322373124, + "grad_norm": 1.908596396446228, + "learning_rate": 1.98e-06, + "loss": 0.3338, + "step": 100 + }, + { + "epoch": 0.0031451036454610438, + "grad_norm": 1.8309729099273682, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.0034310221586847747, + "grad_norm": 3.051408290863037, + "learning_rate": 2.38e-06, + "loss": 0.2418, + "step": 120 + }, + { + "epoch": 0.003716940671908506, + "grad_norm": 2.4083356857299805, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1726, + "step": 130 + }, + { + "epoch": 0.004002859185132237, + "grad_norm": 1.111687421798706, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.2164, + "step": 140 + }, + { + "epoch": 0.004288777698355968, + "grad_norm": 1.3874679803848267, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1312, + "step": 150 + }, + { + "epoch": 0.0045746962115796996, + "grad_norm": 1.2791540622711182, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1198, + "step": 160 + }, + { + "epoch": 0.004860614724803431, + "grad_norm": 1.6237181425094604, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1027, + "step": 170 + }, + { + "epoch": 0.005146533238027162, + "grad_norm": 0.9669432640075684, + "learning_rate": 3.58e-06, + "loss": 0.0968, + "step": 180 + }, + { + "epoch": 0.0054324517512508936, + "grad_norm": 1.4933182001113892, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.1012, + "step": 190 + }, + { + "epoch": 0.005718370264474625, + "grad_norm": 1.8615745306015015, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0901, + "step": 200 + }, + { + "epoch": 0.006004288777698356, + "grad_norm": 1.867163062095642, + "learning_rate": 4.18e-06, + "loss": 0.1067, + "step": 210 + }, + { + "epoch": 0.0062902072909220876, + "grad_norm": 1.199497103691101, + "learning_rate": 4.38e-06, + "loss": 0.0841, + "step": 220 + }, + { + "epoch": 0.006576125804145818, + "grad_norm": 1.1568272113800049, + "learning_rate": 4.58e-06, + "loss": 0.0951, + "step": 230 + }, + { + "epoch": 0.006862044317369549, + "grad_norm": 2.139226198196411, + "learning_rate": 4.78e-06, + "loss": 0.0845, + "step": 240 + }, + { + "epoch": 0.007147962830593281, + "grad_norm": 1.0357667207717896, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0828, + "step": 250 + }, + { + "epoch": 0.007433881343817012, + "grad_norm": 1.0145683288574219, + "learning_rate": 5.18e-06, + "loss": 0.0925, + "step": 260 + }, + { + "epoch": 0.007719799857040743, + "grad_norm": 1.308053731918335, + "learning_rate": 5.380000000000001e-06, + "loss": 0.082, + "step": 270 + }, + { + "epoch": 0.008005718370264474, + "grad_norm": 1.1561739444732666, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0888, + "step": 280 + }, + { + "epoch": 0.008291636883488206, + "grad_norm": 0.8777005672454834, + "learning_rate": 5.78e-06, + "loss": 0.0693, + "step": 290 + }, + { + "epoch": 0.008577555396711936, + "grad_norm": 0.9127368330955505, + "learning_rate": 5.98e-06, + "loss": 0.0823, + "step": 300 + }, + { + "epoch": 0.008863473909935669, + "grad_norm": 0.5608117580413818, + "learning_rate": 6.18e-06, + "loss": 0.0733, + "step": 310 + }, + { + "epoch": 0.009149392423159399, + "grad_norm": 1.9068444967269897, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0772, + "step": 320 + }, + { + "epoch": 0.009435310936383131, + "grad_norm": 0.9090886116027832, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.062, + "step": 330 + }, + { + "epoch": 0.009721229449606862, + "grad_norm": 1.191778540611267, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0718, + "step": 340 + }, + { + "epoch": 0.010007147962830594, + "grad_norm": 1.3743036985397339, + "learning_rate": 6.98e-06, + "loss": 0.0822, + "step": 350 + }, + { + "epoch": 0.010293066476054324, + "grad_norm": 1.4244364500045776, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0793, + "step": 360 + }, + { + "epoch": 0.010578984989278055, + "grad_norm": 1.1766910552978516, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0637, + "step": 370 + }, + { + "epoch": 0.010864903502501787, + "grad_norm": 1.1331329345703125, + "learning_rate": 7.58e-06, + "loss": 0.0705, + "step": 380 + }, + { + "epoch": 0.011150822015725518, + "grad_norm": 0.4898548424243927, + "learning_rate": 7.78e-06, + "loss": 0.0686, + "step": 390 + }, + { + "epoch": 0.01143674052894925, + "grad_norm": 0.7398406267166138, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0719, + "step": 400 + }, + { + "epoch": 0.01172265904217298, + "grad_norm": 1.1516162157058716, + "learning_rate": 8.18e-06, + "loss": 0.0696, + "step": 410 + }, + { + "epoch": 0.012008577555396712, + "grad_norm": 1.6034163236618042, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0698, + "step": 420 + }, + { + "epoch": 0.012294496068620443, + "grad_norm": 1.2195311784744263, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.012580414581844175, + "grad_norm": 1.1106441020965576, + "learning_rate": 8.78e-06, + "loss": 0.0749, + "step": 440 + }, + { + "epoch": 0.012866333095067906, + "grad_norm": 1.1787506341934204, + "learning_rate": 8.98e-06, + "loss": 0.0718, + "step": 450 + }, + { + "epoch": 0.013152251608291636, + "grad_norm": 0.4380492568016052, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0692, + "step": 460 + }, + { + "epoch": 0.013438170121515368, + "grad_norm": 1.0138392448425293, + "learning_rate": 9.38e-06, + "loss": 0.0718, + "step": 470 + }, + { + "epoch": 0.013724088634739099, + "grad_norm": 0.50003582239151, + "learning_rate": 9.58e-06, + "loss": 0.078, + "step": 480 + }, + { + "epoch": 0.014010007147962831, + "grad_norm": 0.6253323554992676, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0762, + "step": 490 + }, + { + "epoch": 0.014295925661186561, + "grad_norm": 0.6725791096687317, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0615, + "step": 500 + }, + { + "epoch": 0.014581844174410294, + "grad_norm": 0.6100206971168518, + "learning_rate": 1.018e-05, + "loss": 0.0576, + "step": 510 + }, + { + "epoch": 0.014867762687634024, + "grad_norm": 1.9225071668624878, + "learning_rate": 1.038e-05, + "loss": 0.0957, + "step": 520 + }, + { + "epoch": 0.015153681200857756, + "grad_norm": 1.304625391960144, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.015439599714081487, + "grad_norm": 0.7657200694084167, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0612, + "step": 540 + }, + { + "epoch": 0.015725518227305217, + "grad_norm": 0.7371220588684082, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0719, + "step": 550 + }, + { + "epoch": 0.016011436740528948, + "grad_norm": 0.7274985313415527, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0594, + "step": 560 + }, + { + "epoch": 0.01629735525375268, + "grad_norm": 1.3222947120666504, + "learning_rate": 1.138e-05, + "loss": 0.0655, + "step": 570 + }, + { + "epoch": 0.016583273766976412, + "grad_norm": 0.965411901473999, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0638, + "step": 580 + }, + { + "epoch": 0.016869192280200142, + "grad_norm": 0.8161532878875732, + "learning_rate": 1.178e-05, + "loss": 0.0532, + "step": 590 + }, + { + "epoch": 0.017155110793423873, + "grad_norm": 0.8228808045387268, + "learning_rate": 1.198e-05, + "loss": 0.051, + "step": 600 + }, + { + "epoch": 0.017441029306647607, + "grad_norm": 0.6932743191719055, + "learning_rate": 1.218e-05, + "loss": 0.0595, + "step": 610 + }, + { + "epoch": 0.017726947819871337, + "grad_norm": 0.6848511099815369, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0589, + "step": 620 + }, + { + "epoch": 0.018012866333095068, + "grad_norm": 1.137454867362976, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0694, + "step": 630 + }, + { + "epoch": 0.018298784846318798, + "grad_norm": 0.8087878227233887, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0738, + "step": 640 + }, + { + "epoch": 0.01858470335954253, + "grad_norm": 0.8093737363815308, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.058, + "step": 650 + }, + { + "epoch": 0.018870621872766263, + "grad_norm": 0.8387401700019836, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0686, + "step": 660 + }, + { + "epoch": 0.019156540385989993, + "grad_norm": 1.1544110774993896, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0592, + "step": 670 + }, + { + "epoch": 0.019442458899213724, + "grad_norm": 0.8208314180374146, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.019728377412437454, + "grad_norm": 0.97088623046875, + "learning_rate": 1.378e-05, + "loss": 0.0675, + "step": 690 + }, + { + "epoch": 0.020014295925661188, + "grad_norm": 1.0991814136505127, + "learning_rate": 1.398e-05, + "loss": 0.0745, + "step": 700 + }, + { + "epoch": 0.02030021443888492, + "grad_norm": 0.9467299580574036, + "learning_rate": 1.418e-05, + "loss": 0.0645, + "step": 710 + }, + { + "epoch": 0.02058613295210865, + "grad_norm": 0.4910801351070404, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0466, + "step": 720 + }, + { + "epoch": 0.02087205146533238, + "grad_norm": 1.0102845430374146, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0735, + "step": 730 + }, + { + "epoch": 0.02115796997855611, + "grad_norm": 0.9033467769622803, + "learning_rate": 1.478e-05, + "loss": 0.0741, + "step": 740 + }, + { + "epoch": 0.021443888491779844, + "grad_norm": 1.6092171669006348, + "learning_rate": 1.498e-05, + "loss": 0.0737, + "step": 750 + }, + { + "epoch": 0.021729807005003574, + "grad_norm": 0.7047333717346191, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0604, + "step": 760 + }, + { + "epoch": 0.022015725518227305, + "grad_norm": 1.2015491724014282, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0646, + "step": 770 + }, + { + "epoch": 0.022301644031451035, + "grad_norm": 1.1669623851776123, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0587, + "step": 780 + }, + { + "epoch": 0.02258756254467477, + "grad_norm": 1.137113094329834, + "learning_rate": 1.578e-05, + "loss": 0.0692, + "step": 790 + }, + { + "epoch": 0.0228734810578985, + "grad_norm": 1.269505262374878, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0711, + "step": 800 + }, + { + "epoch": 0.02315939957112223, + "grad_norm": 0.942534863948822, + "learning_rate": 1.618e-05, + "loss": 0.0782, + "step": 810 + }, + { + "epoch": 0.02344531808434596, + "grad_norm": 0.9548556208610535, + "learning_rate": 1.638e-05, + "loss": 0.0814, + "step": 820 + }, + { + "epoch": 0.02373123659756969, + "grad_norm": 1.0210421085357666, + "learning_rate": 1.658e-05, + "loss": 0.0774, + "step": 830 + }, + { + "epoch": 0.024017155110793425, + "grad_norm": 1.0955135822296143, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0693, + "step": 840 + }, + { + "epoch": 0.024303073624017155, + "grad_norm": 1.2081682682037354, + "learning_rate": 1.698e-05, + "loss": 0.0589, + "step": 850 + }, + { + "epoch": 0.024588992137240886, + "grad_norm": 0.9728164076805115, + "learning_rate": 1.718e-05, + "loss": 0.0585, + "step": 860 + }, + { + "epoch": 0.024874910650464616, + "grad_norm": 1.310244083404541, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.066, + "step": 870 + }, + { + "epoch": 0.02516082916368835, + "grad_norm": 0.8860681653022766, + "learning_rate": 1.758e-05, + "loss": 0.0703, + "step": 880 + }, + { + "epoch": 0.02544674767691208, + "grad_norm": 2.1878466606140137, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0913, + "step": 890 + }, + { + "epoch": 0.02573266619013581, + "grad_norm": 0.6659205555915833, + "learning_rate": 1.798e-05, + "loss": 0.0603, + "step": 900 + }, + { + "epoch": 0.02601858470335954, + "grad_norm": 0.6700656414031982, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.074, + "step": 910 + }, + { + "epoch": 0.026304503216583272, + "grad_norm": 0.8292778134346008, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0529, + "step": 920 + }, + { + "epoch": 0.026590421729807006, + "grad_norm": 0.9637550115585327, + "learning_rate": 1.858e-05, + "loss": 0.0604, + "step": 930 + }, + { + "epoch": 0.026876340243030736, + "grad_norm": 0.4605652689933777, + "learning_rate": 1.878e-05, + "loss": 0.0657, + "step": 940 + }, + { + "epoch": 0.027162258756254467, + "grad_norm": 1.3346972465515137, + "learning_rate": 1.898e-05, + "loss": 0.0576, + "step": 950 + }, + { + "epoch": 0.027448177269478197, + "grad_norm": 0.8369432091712952, + "learning_rate": 1.918e-05, + "loss": 0.0567, + "step": 960 + }, + { + "epoch": 0.02773409578270193, + "grad_norm": 0.613459050655365, + "learning_rate": 1.938e-05, + "loss": 0.0523, + "step": 970 + }, + { + "epoch": 0.028020014295925662, + "grad_norm": 1.402799367904663, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0794, + "step": 980 + }, + { + "epoch": 0.028305932809149392, + "grad_norm": 1.1603201627731323, + "learning_rate": 1.978e-05, + "loss": 0.0583, + "step": 990 + }, + { + "epoch": 0.028591851322373123, + "grad_norm": 0.8101517558097839, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0584, + "step": 1000 + }, + { + "epoch": 0.028877769835596853, + "grad_norm": 1.060592770576477, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.077, + "step": 1010 + }, + { + "epoch": 0.029163688348820587, + "grad_norm": 1.2096195220947266, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.066, + "step": 1020 + }, + { + "epoch": 0.029449606862044318, + "grad_norm": 1.0035862922668457, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0625, + "step": 1030 + }, + { + "epoch": 0.029735525375268048, + "grad_norm": 0.44185084104537964, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.03002144388849178, + "grad_norm": 1.209908127784729, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0693, + "step": 1050 + }, + { + "epoch": 0.030307362401715512, + "grad_norm": 0.9716938138008118, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0706, + "step": 1060 + }, + { + "epoch": 0.030593280914939243, + "grad_norm": 0.8310994505882263, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0622, + "step": 1070 + }, + { + "epoch": 0.030879199428162973, + "grad_norm": 0.8737888932228088, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0564, + "step": 1080 + }, + { + "epoch": 0.031165117941386704, + "grad_norm": 0.7609763145446777, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0552, + "step": 1090 + }, + { + "epoch": 0.031451036454610434, + "grad_norm": 0.6319764256477356, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0593, + "step": 1100 + }, + { + "epoch": 0.031736954967834165, + "grad_norm": 0.5562251806259155, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0553, + "step": 1110 + }, + { + "epoch": 0.032022873481057895, + "grad_norm": 1.3476046323776245, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0805, + "step": 1120 + }, + { + "epoch": 0.03230879199428163, + "grad_norm": 0.5449394583702087, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0666, + "step": 1130 + }, + { + "epoch": 0.03259471050750536, + "grad_norm": 0.8675817251205444, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0703, + "step": 1140 + }, + { + "epoch": 0.032880629020729094, + "grad_norm": 0.8713150024414062, + "learning_rate": 1.999882759038658e-05, + "loss": 0.063, + "step": 1150 + }, + { + "epoch": 0.033166547533952824, + "grad_norm": 0.7205761075019836, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0597, + "step": 1160 + }, + { + "epoch": 0.033452466047176554, + "grad_norm": 0.482741117477417, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0558, + "step": 1170 + }, + { + "epoch": 0.033738384560400285, + "grad_norm": 0.8652167320251465, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0559, + "step": 1180 + }, + { + "epoch": 0.034024303073624015, + "grad_norm": 0.5286755561828613, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0673, + "step": 1190 + }, + { + "epoch": 0.034310221586847746, + "grad_norm": 0.9883217215538025, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0609, + "step": 1200 + }, + { + "epoch": 0.034596140100071476, + "grad_norm": 0.7700253129005432, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0703, + "step": 1210 + }, + { + "epoch": 0.034882058613295214, + "grad_norm": 0.8669867515563965, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0577, + "step": 1220 + }, + { + "epoch": 0.035167977126518944, + "grad_norm": 0.8856104016304016, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0599, + "step": 1230 + }, + { + "epoch": 0.035453895639742675, + "grad_norm": 0.5517004728317261, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0791, + "step": 1240 + }, + { + "epoch": 0.035739814152966405, + "grad_norm": 0.7505853176116943, + "learning_rate": 1.999672592499692e-05, + "loss": 0.086, + "step": 1250 + }, + { + "epoch": 0.036025732666190136, + "grad_norm": 0.7412230968475342, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0534, + "step": 1260 + }, + { + "epoch": 0.036311651179413866, + "grad_norm": 0.6629419922828674, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0607, + "step": 1270 + }, + { + "epoch": 0.036597569692637597, + "grad_norm": 0.7081887125968933, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0513, + "step": 1280 + }, + { + "epoch": 0.03688348820586133, + "grad_norm": 0.8555129766464233, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0574, + "step": 1290 + }, + { + "epoch": 0.03716940671908506, + "grad_norm": 0.5992563366889954, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0529, + "step": 1300 + }, + { + "epoch": 0.037455325232308795, + "grad_norm": 0.8527185320854187, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0588, + "step": 1310 + }, + { + "epoch": 0.037741243745532525, + "grad_norm": 1.078600525856018, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0644, + "step": 1320 + }, + { + "epoch": 0.038027162258756256, + "grad_norm": 0.8158502578735352, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0587, + "step": 1330 + }, + { + "epoch": 0.038313080771979986, + "grad_norm": 1.011278748512268, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0696, + "step": 1340 + }, + { + "epoch": 0.03859899928520372, + "grad_norm": 0.806888222694397, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0588, + "step": 1350 + }, + { + "epoch": 0.03888491779842745, + "grad_norm": 0.7776031494140625, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0461, + "step": 1360 + }, + { + "epoch": 0.03917083631165118, + "grad_norm": 0.6119349598884583, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0566, + "step": 1370 + }, + { + "epoch": 0.03945675482487491, + "grad_norm": 0.6168059706687927, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0636, + "step": 1380 + }, + { + "epoch": 0.03974267333809864, + "grad_norm": 0.8180692195892334, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0596, + "step": 1390 + }, + { + "epoch": 0.040028591851322376, + "grad_norm": 0.6775726079940796, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0586, + "step": 1400 + }, + { + "epoch": 0.040314510364546106, + "grad_norm": 0.7446377873420715, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.057, + "step": 1410 + }, + { + "epoch": 0.04060042887776984, + "grad_norm": 0.9334514737129211, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0551, + "step": 1420 + }, + { + "epoch": 0.04088634739099357, + "grad_norm": 1.481874942779541, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0664, + "step": 1430 + }, + { + "epoch": 0.0411722659042173, + "grad_norm": 0.9553850889205933, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0645, + "step": 1440 + }, + { + "epoch": 0.04145818441744103, + "grad_norm": 0.8824119567871094, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0694, + "step": 1450 + }, + { + "epoch": 0.04174410293066476, + "grad_norm": 1.0382661819458008, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0683, + "step": 1460 + }, + { + "epoch": 0.04203002144388849, + "grad_norm": 0.5914127826690674, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0652, + "step": 1470 + }, + { + "epoch": 0.04231593995711222, + "grad_norm": 0.8497964143753052, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0706, + "step": 1480 + }, + { + "epoch": 0.04260185847033596, + "grad_norm": 0.897759199142456, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0647, + "step": 1490 + }, + { + "epoch": 0.04288777698355969, + "grad_norm": 1.1102443933486938, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0579, + "step": 1500 + }, + { + "epoch": 0.04317369549678342, + "grad_norm": 0.7638678550720215, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0606, + "step": 1510 + }, + { + "epoch": 0.04345961401000715, + "grad_norm": 0.6662708520889282, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.067, + "step": 1520 + }, + { + "epoch": 0.04374553252323088, + "grad_norm": 0.4957924485206604, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0685, + "step": 1530 + }, + { + "epoch": 0.04403145103645461, + "grad_norm": 0.6456794738769531, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0665, + "step": 1540 + }, + { + "epoch": 0.04431736954967834, + "grad_norm": 1.1598498821258545, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0527, + "step": 1550 + }, + { + "epoch": 0.04460328806290207, + "grad_norm": 0.931520938873291, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0682, + "step": 1560 + }, + { + "epoch": 0.0448892065761258, + "grad_norm": 0.7289925813674927, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0726, + "step": 1570 + }, + { + "epoch": 0.04517512508934954, + "grad_norm": 0.5471235513687134, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0561, + "step": 1580 + }, + { + "epoch": 0.04546104360257327, + "grad_norm": 0.8686550259590149, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0552, + "step": 1590 + }, + { + "epoch": 0.045746962115797, + "grad_norm": 1.1767120361328125, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0544, + "step": 1600 + }, + { + "epoch": 0.04603288062902073, + "grad_norm": 0.8729729056358337, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0625, + "step": 1610 + }, + { + "epoch": 0.04631879914224446, + "grad_norm": 1.3734601736068726, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0667, + "step": 1620 + }, + { + "epoch": 0.04660471765546819, + "grad_norm": 0.6810682415962219, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0522, + "step": 1630 + }, + { + "epoch": 0.04689063616869192, + "grad_norm": 0.7744873762130737, + "learning_rate": 1.997844517262844e-05, + "loss": 0.06, + "step": 1640 + }, + { + "epoch": 0.04717655468191565, + "grad_norm": 1.000954270362854, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0606, + "step": 1650 + }, + { + "epoch": 0.04746247319513938, + "grad_norm": 0.8105701208114624, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.04774839170836312, + "grad_norm": 0.9504240155220032, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0833, + "step": 1670 + }, + { + "epoch": 0.04803431022158685, + "grad_norm": 0.910836935043335, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0582, + "step": 1680 + }, + { + "epoch": 0.04832022873481058, + "grad_norm": 0.5865645408630371, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0657, + "step": 1690 + }, + { + "epoch": 0.04860614724803431, + "grad_norm": 1.0098698139190674, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 0.04889206576125804, + "grad_norm": 0.8097764253616333, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0563, + "step": 1710 + }, + { + "epoch": 0.04917798427448177, + "grad_norm": 0.9958128333091736, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0597, + "step": 1720 + }, + { + "epoch": 0.0494639027877055, + "grad_norm": 0.8471905589103699, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.04974982130092923, + "grad_norm": 0.647058367729187, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 0.05003573981415296, + "grad_norm": 1.0832161903381348, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.052, + "step": 1750 + }, + { + "epoch": 0.0503216583273767, + "grad_norm": 0.8469381332397461, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0615, + "step": 1760 + }, + { + "epoch": 0.05060757684060043, + "grad_norm": 0.5371052622795105, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0556, + "step": 1770 + }, + { + "epoch": 0.05089349535382416, + "grad_norm": 0.9016183614730835, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0561, + "step": 1780 + }, + { + "epoch": 0.05117941386704789, + "grad_norm": 0.8829526305198669, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0648, + "step": 1790 + }, + { + "epoch": 0.05146533238027162, + "grad_norm": 1.079738974571228, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0577, + "step": 1800 + }, + { + "epoch": 0.05175125089349535, + "grad_norm": 0.7496556639671326, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.052, + "step": 1810 + }, + { + "epoch": 0.05203716940671908, + "grad_norm": 0.7587016820907593, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0634, + "step": 1820 + }, + { + "epoch": 0.052323087919942814, + "grad_norm": 0.9622246623039246, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0543, + "step": 1830 + }, + { + "epoch": 0.052609006433166544, + "grad_norm": 0.6643623113632202, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0606, + "step": 1840 + }, + { + "epoch": 0.05289492494639028, + "grad_norm": 0.8060843348503113, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0562, + "step": 1850 + }, + { + "epoch": 0.05318084345961401, + "grad_norm": 0.7353034019470215, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0679, + "step": 1860 + }, + { + "epoch": 0.05346676197283774, + "grad_norm": 0.6636782288551331, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0561, + "step": 1870 + }, + { + "epoch": 0.05375268048606147, + "grad_norm": 0.6760010719299316, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0674, + "step": 1880 + }, + { + "epoch": 0.0540385989992852, + "grad_norm": 0.7144591808319092, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0551, + "step": 1890 + }, + { + "epoch": 0.054324517512508934, + "grad_norm": 0.8346575498580933, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.049, + "step": 1900 + }, + { + "epoch": 0.054610436025732664, + "grad_norm": 1.1682871580123901, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0554, + "step": 1910 + }, + { + "epoch": 0.054896354538956395, + "grad_norm": 0.9150840640068054, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0549, + "step": 1920 + }, + { + "epoch": 0.055182273052180125, + "grad_norm": 0.37064746022224426, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0547, + "step": 1930 + }, + { + "epoch": 0.05546819156540386, + "grad_norm": 1.1214783191680908, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0697, + "step": 1940 + }, + { + "epoch": 0.05575411007862759, + "grad_norm": 0.8259853720664978, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0614, + "step": 1950 + }, + { + "epoch": 0.056040028591851324, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0811, + "step": 1960 + }, + { + "epoch": 0.056325947105075054, + "grad_norm": 0.8764797449111938, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0605, + "step": 1970 + }, + { + "epoch": 0.056611865618298784, + "grad_norm": 0.770044207572937, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0481, + "step": 1980 + }, + { + "epoch": 0.056897784131522515, + "grad_norm": 1.333876132965088, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0729, + "step": 1990 + }, + { + "epoch": 0.057183702644746245, + "grad_norm": 0.5231258273124695, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.051, + "step": 2000 + }, + { + "epoch": 0.057469621157969976, + "grad_norm": 1.1937541961669922, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.061, + "step": 2010 + }, + { + "epoch": 0.057755539671193706, + "grad_norm": 0.7843487858772278, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0688, + "step": 2020 + }, + { + "epoch": 0.058041458184417444, + "grad_norm": 0.7956593632698059, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0878, + "step": 2030 + }, + { + "epoch": 0.058327376697641174, + "grad_norm": 0.5006444454193115, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0588, + "step": 2040 + }, + { + "epoch": 0.058613295210864905, + "grad_norm": 1.162245750427246, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0619, + "step": 2050 + }, + { + "epoch": 0.058899213724088635, + "grad_norm": 0.46943384408950806, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0584, + "step": 2060 + }, + { + "epoch": 0.059185132237312366, + "grad_norm": 0.3780323266983032, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0462, + "step": 2070 + }, + { + "epoch": 0.059471050750536096, + "grad_norm": 0.7066171765327454, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0652, + "step": 2080 + }, + { + "epoch": 0.05975696926375983, + "grad_norm": 0.8464685082435608, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0546, + "step": 2090 + }, + { + "epoch": 0.06004288777698356, + "grad_norm": 0.7198944687843323, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0534, + "step": 2100 + }, + { + "epoch": 0.06032880629020729, + "grad_norm": 0.7136557698249817, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0665, + "step": 2110 + }, + { + "epoch": 0.060614724803431025, + "grad_norm": 0.8739225268363953, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0542, + "step": 2120 + }, + { + "epoch": 0.060900643316654755, + "grad_norm": 0.6694063544273376, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0575, + "step": 2130 + }, + { + "epoch": 0.061186561829878486, + "grad_norm": 0.4805296063423157, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0578, + "step": 2140 + }, + { + "epoch": 0.061472480343102216, + "grad_norm": 0.758660078048706, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0455, + "step": 2150 + }, + { + "epoch": 0.06175839885632595, + "grad_norm": 0.8114968538284302, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0801, + "step": 2160 + }, + { + "epoch": 0.06204431736954968, + "grad_norm": 0.6585670113563538, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0564, + "step": 2170 + }, + { + "epoch": 0.06233023588277341, + "grad_norm": 1.2986794710159302, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0595, + "step": 2180 + }, + { + "epoch": 0.06261615439599715, + "grad_norm": 0.9822471141815186, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0515, + "step": 2190 + }, + { + "epoch": 0.06290207290922087, + "grad_norm": 0.8112025260925293, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0585, + "step": 2200 + }, + { + "epoch": 0.0631879914224446, + "grad_norm": 0.6239551305770874, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0641, + "step": 2210 + }, + { + "epoch": 0.06347390993566833, + "grad_norm": 0.8405657410621643, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.057, + "step": 2220 + }, + { + "epoch": 0.06375982844889207, + "grad_norm": 0.4925670623779297, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0568, + "step": 2230 + }, + { + "epoch": 0.06404574696211579, + "grad_norm": 0.8599978089332581, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0587, + "step": 2240 + }, + { + "epoch": 0.06433166547533953, + "grad_norm": 0.8657258749008179, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.06461758398856327, + "grad_norm": 0.5826218128204346, + "learning_rate": 1.991642153373178e-05, + "loss": 0.055, + "step": 2260 + }, + { + "epoch": 0.06490350250178699, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0533, + "step": 2270 + }, + { + "epoch": 0.06518942101501073, + "grad_norm": 0.8345134258270264, + "learning_rate": 1.991374933341515e-05, + "loss": 0.064, + "step": 2280 + }, + { + "epoch": 0.06547533952823445, + "grad_norm": 0.6610177755355835, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0553, + "step": 2290 + }, + { + "epoch": 0.06576125804145819, + "grad_norm": 0.8541404604911804, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0483, + "step": 2300 + }, + { + "epoch": 0.06604717655468191, + "grad_norm": 0.9029123187065125, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0517, + "step": 2310 + }, + { + "epoch": 0.06633309506790565, + "grad_norm": 0.614111602306366, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.06661901358112937, + "grad_norm": 0.8723806142807007, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0511, + "step": 2330 + }, + { + "epoch": 0.06690493209435311, + "grad_norm": 0.5288586020469666, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0474, + "step": 2340 + }, + { + "epoch": 0.06719085060757685, + "grad_norm": 0.6346511840820312, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0532, + "step": 2350 + }, + { + "epoch": 0.06747676912080057, + "grad_norm": 0.9112687706947327, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0583, + "step": 2360 + }, + { + "epoch": 0.06776268763402431, + "grad_norm": 0.6879385113716125, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0551, + "step": 2370 + }, + { + "epoch": 0.06804860614724803, + "grad_norm": 0.6945562958717346, + "learning_rate": 1.989976094288735e-05, + "loss": 0.053, + "step": 2380 + }, + { + "epoch": 0.06833452466047177, + "grad_norm": 0.6774301528930664, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0596, + "step": 2390 + }, + { + "epoch": 0.06862044317369549, + "grad_norm": 0.7311446070671082, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0576, + "step": 2400 + }, + { + "epoch": 0.06890636168691923, + "grad_norm": 0.9301936030387878, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0608, + "step": 2410 + }, + { + "epoch": 0.06919228020014295, + "grad_norm": 1.1750341653823853, + "learning_rate": 1.989387305123247e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.06947819871336669, + "grad_norm": 0.716266930103302, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.053, + "step": 2430 + }, + { + "epoch": 0.06976411722659043, + "grad_norm": 0.8549973964691162, + "learning_rate": 1.989086647373215e-05, + "loss": 0.061, + "step": 2440 + }, + { + "epoch": 0.07005003573981415, + "grad_norm": 0.7306638360023499, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "epoch": 0.07033595425303789, + "grad_norm": 1.2529624700546265, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0597, + "step": 2460 + }, + { + "epoch": 0.07062187276626161, + "grad_norm": 0.7199717164039612, + "learning_rate": 1.988627835751598e-05, + "loss": 0.047, + "step": 2470 + }, + { + "epoch": 0.07090779127948535, + "grad_norm": 0.8007253408432007, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0544, + "step": 2480 + }, + { + "epoch": 0.07119370979270907, + "grad_norm": 0.7852535843849182, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0507, + "step": 2490 + }, + { + "epoch": 0.07147962830593281, + "grad_norm": 1.0649739503860474, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.058, + "step": 2500 + }, + { + "epoch": 0.07176554681915653, + "grad_norm": 0.8080071806907654, + "learning_rate": 1.988001487826387e-05, + "loss": 0.059, + "step": 2510 + }, + { + "epoch": 0.07205146533238027, + "grad_norm": 0.49453601241111755, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0522, + "step": 2520 + }, + { + "epoch": 0.07233738384560401, + "grad_norm": 0.7618975639343262, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.07262330235882773, + "grad_norm": 0.6284596920013428, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.058, + "step": 2540 + }, + { + "epoch": 0.07290922087205147, + "grad_norm": 1.6536812782287598, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0535, + "step": 2550 + }, + { + "epoch": 0.07319513938527519, + "grad_norm": 0.6516987681388855, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.061, + "step": 2560 + }, + { + "epoch": 0.07348105789849893, + "grad_norm": 0.7660441398620605, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0603, + "step": 2570 + }, + { + "epoch": 0.07376697641172265, + "grad_norm": 0.7900884747505188, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0494, + "step": 2580 + }, + { + "epoch": 0.07405289492494639, + "grad_norm": 0.9578459858894348, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0492, + "step": 2590 + }, + { + "epoch": 0.07433881343817011, + "grad_norm": 0.5268751978874207, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0524, + "step": 2600 + }, + { + "epoch": 0.07462473195139385, + "grad_norm": 0.8935990929603577, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0537, + "step": 2610 + }, + { + "epoch": 0.07491065046461759, + "grad_norm": 0.940441370010376, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0545, + "step": 2620 + }, + { + "epoch": 0.07519656897784131, + "grad_norm": 0.42767468094825745, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0508, + "step": 2630 + }, + { + "epoch": 0.07548248749106505, + "grad_norm": 0.6892207860946655, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0417, + "step": 2640 + }, + { + "epoch": 0.07576840600428877, + "grad_norm": 1.2622859477996826, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0665, + "step": 2650 + }, + { + "epoch": 0.07605432451751251, + "grad_norm": 0.8809115290641785, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0617, + "step": 2660 + }, + { + "epoch": 0.07634024303073624, + "grad_norm": 0.604371190071106, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0577, + "step": 2670 + }, + { + "epoch": 0.07662616154395997, + "grad_norm": 0.7091525793075562, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0552, + "step": 2680 + }, + { + "epoch": 0.0769120800571837, + "grad_norm": 0.7841326594352722, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0579, + "step": 2690 + }, + { + "epoch": 0.07719799857040743, + "grad_norm": 0.7789046764373779, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0511, + "step": 2700 + }, + { + "epoch": 0.07748391708363117, + "grad_norm": 0.6497660875320435, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0532, + "step": 2710 + }, + { + "epoch": 0.0777698355968549, + "grad_norm": 0.6902356147766113, + "learning_rate": 1.984439891859038e-05, + "loss": 0.06, + "step": 2720 + }, + { + "epoch": 0.07805575411007863, + "grad_norm": 0.5721703767776489, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0442, + "step": 2730 + }, + { + "epoch": 0.07834167262330236, + "grad_norm": 0.5205336809158325, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0551, + "step": 2740 + }, + { + "epoch": 0.07862759113652609, + "grad_norm": 1.0646073818206787, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0546, + "step": 2750 + }, + { + "epoch": 0.07891350964974982, + "grad_norm": 0.6809906363487244, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0603, + "step": 2760 + }, + { + "epoch": 0.07919942816297355, + "grad_norm": 0.7592756152153015, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0611, + "step": 2770 + }, + { + "epoch": 0.07948534667619728, + "grad_norm": 0.970733106136322, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.066, + "step": 2780 + }, + { + "epoch": 0.07977126518942101, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.983150881656814e-05, + "loss": 0.049, + "step": 2790 + }, + { + "epoch": 0.08005718370264475, + "grad_norm": 0.6761397123336792, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.048, + "step": 2800 + }, + { + "epoch": 0.08034310221586848, + "grad_norm": 0.9752228856086731, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.08062902072909221, + "grad_norm": 0.8727124929428101, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0629, + "step": 2820 + }, + { + "epoch": 0.08091493924231594, + "grad_norm": 0.8425240516662598, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0517, + "step": 2830 + }, + { + "epoch": 0.08120085775553967, + "grad_norm": 0.7011470198631287, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0484, + "step": 2840 + }, + { + "epoch": 0.0814867762687634, + "grad_norm": 0.836200475692749, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0504, + "step": 2850 + }, + { + "epoch": 0.08177269478198713, + "grad_norm": 0.4431964159011841, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0527, + "step": 2860 + }, + { + "epoch": 0.08205861329521086, + "grad_norm": 0.4666791260242462, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0556, + "step": 2870 + }, + { + "epoch": 0.0823445318084346, + "grad_norm": 0.5705346465110779, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0544, + "step": 2880 + }, + { + "epoch": 0.08263045032165833, + "grad_norm": 1.7237486839294434, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0555, + "step": 2890 + }, + { + "epoch": 0.08291636883488206, + "grad_norm": 0.9305147528648376, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.055, + "step": 2900 + }, + { + "epoch": 0.0832022873481058, + "grad_norm": 1.3475992679595947, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0723, + "step": 2910 + }, + { + "epoch": 0.08348820586132952, + "grad_norm": 0.7196787595748901, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0581, + "step": 2920 + }, + { + "epoch": 0.08377412437455325, + "grad_norm": 0.4567016363143921, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.08406004288777698, + "grad_norm": 0.8537796139717102, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0589, + "step": 2940 + }, + { + "epoch": 0.08434596140100072, + "grad_norm": 0.9526864886283875, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0581, + "step": 2950 + }, + { + "epoch": 0.08463187991422444, + "grad_norm": 0.8753517866134644, + "learning_rate": 1.979809151602651e-05, + "loss": 0.066, + "step": 2960 + }, + { + "epoch": 0.08491779842744818, + "grad_norm": 0.9062561988830566, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0472, + "step": 2970 + }, + { + "epoch": 0.08520371694067191, + "grad_norm": 1.0018329620361328, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0584, + "step": 2980 + }, + { + "epoch": 0.08548963545389564, + "grad_norm": 1.0577157735824585, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.05, + "step": 2990 + }, + { + "epoch": 0.08577555396711938, + "grad_norm": 1.0216799974441528, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0703, + "step": 3000 + }, + { + "epoch": 0.0860614724803431, + "grad_norm": 0.5581191778182983, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0682, + "step": 3010 + }, + { + "epoch": 0.08634739099356684, + "grad_norm": 0.6187682151794434, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 0.08663330950679056, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0565, + "step": 3030 + }, + { + "epoch": 0.0869192280200143, + "grad_norm": 0.8952509760856628, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0615, + "step": 3040 + }, + { + "epoch": 0.08720514653323802, + "grad_norm": 0.7387855648994446, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0434, + "step": 3050 + }, + { + "epoch": 0.08749106504646176, + "grad_norm": 0.8661363124847412, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0453, + "step": 3060 + }, + { + "epoch": 0.0877769835596855, + "grad_norm": 1.552089810371399, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0577, + "step": 3070 + }, + { + "epoch": 0.08806290207290922, + "grad_norm": 0.7555598616600037, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.058, + "step": 3080 + }, + { + "epoch": 0.08834882058613296, + "grad_norm": 0.7763100266456604, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.074, + "step": 3090 + }, + { + "epoch": 0.08863473909935668, + "grad_norm": 0.5088932514190674, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.07, + "step": 3100 + }, + { + "epoch": 0.08892065761258042, + "grad_norm": 0.517383873462677, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.08920657612580414, + "grad_norm": 0.9673930406570435, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.061, + "step": 3120 + }, + { + "epoch": 0.08949249463902788, + "grad_norm": 1.1182832717895508, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0503, + "step": 3130 + }, + { + "epoch": 0.0897784131522516, + "grad_norm": 0.8064592480659485, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0527, + "step": 3140 + }, + { + "epoch": 0.09006433166547534, + "grad_norm": 1.3616310358047485, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0491, + "step": 3150 + }, + { + "epoch": 0.09035025017869908, + "grad_norm": 0.6205968856811523, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0492, + "step": 3160 + }, + { + "epoch": 0.0906361686919228, + "grad_norm": 0.9427729249000549, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.054, + "step": 3170 + }, + { + "epoch": 0.09092208720514654, + "grad_norm": 0.6940050721168518, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0622, + "step": 3180 + }, + { + "epoch": 0.09120800571837026, + "grad_norm": 0.7082361578941345, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0474, + "step": 3190 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.4606474041938782, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 0.09177984274481772, + "grad_norm": 0.46445760130882263, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0483, + "step": 3210 + }, + { + "epoch": 0.09206576125804146, + "grad_norm": 0.7431371212005615, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.046, + "step": 3220 + }, + { + "epoch": 0.09235167977126518, + "grad_norm": 0.8430010676383972, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0665, + "step": 3230 + }, + { + "epoch": 0.09263759828448892, + "grad_norm": 0.9888875484466553, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0676, + "step": 3240 + }, + { + "epoch": 0.09292351679771266, + "grad_norm": 0.792150616645813, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0636, + "step": 3250 + }, + { + "epoch": 0.09320943531093638, + "grad_norm": 0.859030544757843, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0634, + "step": 3260 + }, + { + "epoch": 0.09349535382416012, + "grad_norm": 0.7612795233726501, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0658, + "step": 3270 + }, + { + "epoch": 0.09378127233738384, + "grad_norm": 0.5470104217529297, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0514, + "step": 3280 + }, + { + "epoch": 0.09406719085060758, + "grad_norm": 0.6354894042015076, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0489, + "step": 3290 + }, + { + "epoch": 0.0943531093638313, + "grad_norm": 1.3852356672286987, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 0.09463902787705504, + "grad_norm": 0.5610274076461792, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.09492494639027876, + "grad_norm": 1.2192410230636597, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0525, + "step": 3320 + }, + { + "epoch": 0.0952108649035025, + "grad_norm": 1.06831955909729, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.051, + "step": 3330 + }, + { + "epoch": 0.09549678341672624, + "grad_norm": 0.32288479804992676, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0503, + "step": 3340 + }, + { + "epoch": 0.09578270192994996, + "grad_norm": 0.5871645212173462, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0564, + "step": 3350 + }, + { + "epoch": 0.0960686204431737, + "grad_norm": 0.6069591045379639, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0495, + "step": 3360 + }, + { + "epoch": 0.09635453895639742, + "grad_norm": 1.0015379190444946, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0624, + "step": 3370 + }, + { + "epoch": 0.09664045746962116, + "grad_norm": 0.7534980773925781, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0618, + "step": 3380 + }, + { + "epoch": 0.09692637598284488, + "grad_norm": 0.45888280868530273, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0445, + "step": 3390 + }, + { + "epoch": 0.09721229449606862, + "grad_norm": 0.7550806403160095, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0461, + "step": 3400 + }, + { + "epoch": 0.09749821300929234, + "grad_norm": 0.4738181531429291, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 0.09778413152251608, + "grad_norm": 0.6711190938949585, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0476, + "step": 3420 + }, + { + "epoch": 0.09807005003573982, + "grad_norm": 0.4751316010951996, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0507, + "step": 3430 + }, + { + "epoch": 0.09835596854896354, + "grad_norm": 0.83565753698349, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "epoch": 0.09864188706218728, + "grad_norm": 0.5360665321350098, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0627, + "step": 3450 + }, + { + "epoch": 0.098927805575411, + "grad_norm": 0.7463604211807251, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0496, + "step": 3460 + }, + { + "epoch": 0.09921372408863474, + "grad_norm": 0.7294344305992126, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0697, + "step": 3470 + }, + { + "epoch": 0.09949964260185847, + "grad_norm": 0.5676283836364746, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0541, + "step": 3480 + }, + { + "epoch": 0.0997855611150822, + "grad_norm": 0.5879732370376587, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 0.10007147962830593, + "grad_norm": 0.832818865776062, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0505, + "step": 3500 + }, + { + "epoch": 0.10035739814152966, + "grad_norm": 0.48553410172462463, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0573, + "step": 3510 + }, + { + "epoch": 0.1006433166547534, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0449, + "step": 3520 + }, + { + "epoch": 0.10092923516797712, + "grad_norm": 0.7497885227203369, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0737, + "step": 3530 + }, + { + "epoch": 0.10121515368120086, + "grad_norm": 0.5581928491592407, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0514, + "step": 3540 + }, + { + "epoch": 0.10150107219442459, + "grad_norm": 1.140236258506775, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0524, + "step": 3550 + }, + { + "epoch": 0.10178699070764832, + "grad_norm": 0.8161870241165161, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0721, + "step": 3560 + }, + { + "epoch": 0.10207290922087205, + "grad_norm": 0.8796533942222595, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0564, + "step": 3570 + }, + { + "epoch": 0.10235882773409578, + "grad_norm": 1.4811128377914429, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.10264474624731951, + "grad_norm": 0.8029062747955322, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0589, + "step": 3590 + }, + { + "epoch": 0.10293066476054324, + "grad_norm": 0.7806634902954102, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0617, + "step": 3600 + }, + { + "epoch": 0.10321658327376698, + "grad_norm": 1.1286838054656982, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0574, + "step": 3610 + }, + { + "epoch": 0.1035025017869907, + "grad_norm": 0.374104768037796, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.047, + "step": 3620 + }, + { + "epoch": 0.10378842030021444, + "grad_norm": 1.1743136644363403, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0676, + "step": 3630 + }, + { + "epoch": 0.10407433881343817, + "grad_norm": 0.7684413194656372, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0663, + "step": 3640 + }, + { + "epoch": 0.1043602573266619, + "grad_norm": 1.0642409324645996, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.051, + "step": 3650 + }, + { + "epoch": 0.10464617583988563, + "grad_norm": 0.7752460837364197, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0543, + "step": 3660 + }, + { + "epoch": 0.10493209435310936, + "grad_norm": 0.9053257703781128, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.051, + "step": 3670 + }, + { + "epoch": 0.10521801286633309, + "grad_norm": 0.7407983541488647, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 0.10550393137955683, + "grad_norm": 1.3622519969940186, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0468, + "step": 3690 + }, + { + "epoch": 0.10578984989278056, + "grad_norm": 1.2751463651657104, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0571, + "step": 3700 + }, + { + "epoch": 0.10607576840600429, + "grad_norm": 0.5535411238670349, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0564, + "step": 3710 + }, + { + "epoch": 0.10636168691922802, + "grad_norm": 0.6728671193122864, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0487, + "step": 3720 + }, + { + "epoch": 0.10664760543245175, + "grad_norm": 0.82345050573349, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0656, + "step": 3730 + }, + { + "epoch": 0.10693352394567548, + "grad_norm": 0.6446594595909119, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0467, + "step": 3740 + }, + { + "epoch": 0.10721944245889921, + "grad_norm": 1.0836280584335327, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0536, + "step": 3750 + }, + { + "epoch": 0.10750536097212295, + "grad_norm": 0.3758300840854645, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0476, + "step": 3760 + }, + { + "epoch": 0.10779127948534667, + "grad_norm": 0.682266116142273, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0519, + "step": 3770 + }, + { + "epoch": 0.1080771979985704, + "grad_norm": 0.5025804042816162, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0474, + "step": 3780 + }, + { + "epoch": 0.10836311651179414, + "grad_norm": 1.019890308380127, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0492, + "step": 3790 + }, + { + "epoch": 0.10864903502501787, + "grad_norm": 0.7843710780143738, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0561, + "step": 3800 + }, + { + "epoch": 0.1089349535382416, + "grad_norm": 0.5028522610664368, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0582, + "step": 3810 + }, + { + "epoch": 0.10922087205146533, + "grad_norm": 0.6400144696235657, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0502, + "step": 3820 + }, + { + "epoch": 0.10950679056468907, + "grad_norm": 0.9432899355888367, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0471, + "step": 3830 + }, + { + "epoch": 0.10979270907791279, + "grad_norm": 0.7582482695579529, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.052, + "step": 3840 + }, + { + "epoch": 0.11007862759113653, + "grad_norm": 0.34035608172416687, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0464, + "step": 3850 + }, + { + "epoch": 0.11036454610436025, + "grad_norm": 1.3330878019332886, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.11065046461758399, + "grad_norm": 0.7309219837188721, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.052, + "step": 3870 + }, + { + "epoch": 0.11093638313080773, + "grad_norm": 0.6248922944068909, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0815, + "step": 3880 + }, + { + "epoch": 0.11122230164403145, + "grad_norm": 0.8298835158348083, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0491, + "step": 3890 + }, + { + "epoch": 0.11150822015725519, + "grad_norm": 0.6728928685188293, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0506, + "step": 3900 + }, + { + "epoch": 0.11179413867047891, + "grad_norm": 0.8456764817237854, + "learning_rate": 1.95567930185928e-05, + "loss": 0.051, + "step": 3910 + }, + { + "epoch": 0.11208005718370265, + "grad_norm": 0.9024212956428528, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0514, + "step": 3920 + }, + { + "epoch": 0.11236597569692637, + "grad_norm": 0.4843275845050812, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.056, + "step": 3930 + }, + { + "epoch": 0.11265189421015011, + "grad_norm": 0.5677530765533447, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0548, + "step": 3940 + }, + { + "epoch": 0.11293781272337383, + "grad_norm": 1.0913296937942505, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0697, + "step": 3950 + }, + { + "epoch": 0.11322373123659757, + "grad_norm": 0.6271129250526428, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0655, + "step": 3960 + }, + { + "epoch": 0.1135096497498213, + "grad_norm": 0.9063813090324402, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0469, + "step": 3970 + }, + { + "epoch": 0.11379556826304503, + "grad_norm": 0.7493836283683777, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0482, + "step": 3980 + }, + { + "epoch": 0.11408148677626877, + "grad_norm": 0.8022870421409607, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0655, + "step": 3990 + }, + { + "epoch": 0.11436740528949249, + "grad_norm": 0.6266750693321228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0542, + "step": 4000 + }, + { + "epoch": 0.11465332380271623, + "grad_norm": 0.45027732849121094, + "learning_rate": 1.95260726824789e-05, + "loss": 0.058, + "step": 4010 + }, + { + "epoch": 0.11493924231593995, + "grad_norm": 0.950760543346405, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0552, + "step": 4020 + }, + { + "epoch": 0.11522516082916369, + "grad_norm": 0.6397078037261963, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0544, + "step": 4030 + }, + { + "epoch": 0.11551107934238741, + "grad_norm": 0.7060579657554626, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0565, + "step": 4040 + }, + { + "epoch": 0.11579699785561115, + "grad_norm": 0.7861781716346741, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0511, + "step": 4050 + }, + { + "epoch": 0.11608291636883489, + "grad_norm": 0.5479229688644409, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0509, + "step": 4060 + }, + { + "epoch": 0.11636883488205861, + "grad_norm": 0.3854960501194, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0623, + "step": 4070 + }, + { + "epoch": 0.11665475339528235, + "grad_norm": 1.9533435106277466, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0643, + "step": 4080 + }, + { + "epoch": 0.11694067190850607, + "grad_norm": 0.5853668451309204, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0453, + "step": 4090 + }, + { + "epoch": 0.11722659042172981, + "grad_norm": 0.6850668787956238, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0555, + "step": 4100 + }, + { + "epoch": 0.11751250893495353, + "grad_norm": 1.1605839729309082, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0586, + "step": 4110 + }, + { + "epoch": 0.11779842744817727, + "grad_norm": 0.7753151059150696, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0623, + "step": 4120 + }, + { + "epoch": 0.118084345961401, + "grad_norm": 0.7955726385116577, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0467, + "step": 4130 + }, + { + "epoch": 0.11837026447462473, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0523, + "step": 4140 + }, + { + "epoch": 0.11865618298784847, + "grad_norm": 0.5821241140365601, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 0.11894210150107219, + "grad_norm": 0.4795539379119873, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0559, + "step": 4160 + }, + { + "epoch": 0.11922802001429593, + "grad_norm": 0.6324377655982971, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0473, + "step": 4170 + }, + { + "epoch": 0.11951393852751965, + "grad_norm": 0.8578745722770691, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0474, + "step": 4180 + }, + { + "epoch": 0.11979985704074339, + "grad_norm": 0.5988736748695374, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 0.12008577555396711, + "grad_norm": 0.8098701238632202, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0511, + "step": 4200 + }, + { + "epoch": 0.12037169406719085, + "grad_norm": 1.2059956789016724, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0501, + "step": 4210 + }, + { + "epoch": 0.12065761258041457, + "grad_norm": 0.7477571368217468, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0565, + "step": 4220 + }, + { + "epoch": 0.12094353109363831, + "grad_norm": 0.467942476272583, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0601, + "step": 4230 + }, + { + "epoch": 0.12122944960686205, + "grad_norm": 0.5761682391166687, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.052, + "step": 4240 + }, + { + "epoch": 0.12151536812008577, + "grad_norm": 0.8247032761573792, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0503, + "step": 4250 + }, + { + "epoch": 0.12180128663330951, + "grad_norm": 0.5218040347099304, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0434, + "step": 4260 + }, + { + "epoch": 0.12208720514653323, + "grad_norm": 0.5024936199188232, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 0.12237312365975697, + "grad_norm": 0.5558021664619446, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0493, + "step": 4280 + }, + { + "epoch": 0.1226590421729807, + "grad_norm": 0.6252139210700989, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0467, + "step": 4290 + }, + { + "epoch": 0.12294496068620443, + "grad_norm": 0.6613588929176331, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0572, + "step": 4300 + }, + { + "epoch": 0.12323087919942816, + "grad_norm": 0.8098927736282349, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0551, + "step": 4310 + }, + { + "epoch": 0.1235167977126519, + "grad_norm": 0.8598331809043884, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0517, + "step": 4320 + }, + { + "epoch": 0.12380271622587563, + "grad_norm": 1.2555822134017944, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0499, + "step": 4330 + }, + { + "epoch": 0.12408863473909935, + "grad_norm": 0.5311633348464966, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0467, + "step": 4340 + }, + { + "epoch": 0.12437455325232309, + "grad_norm": 0.5674521327018738, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0564, + "step": 4350 + }, + { + "epoch": 0.12466047176554682, + "grad_norm": 0.5226582884788513, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0503, + "step": 4360 + }, + { + "epoch": 0.12494639027877055, + "grad_norm": 0.8510275483131409, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0626, + "step": 4370 + }, + { + "epoch": 0.1252323087919943, + "grad_norm": 1.6184005737304688, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0565, + "step": 4380 + }, + { + "epoch": 0.125518227305218, + "grad_norm": 0.7836401462554932, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0567, + "step": 4390 + }, + { + "epoch": 0.12580414581844174, + "grad_norm": 0.686989963054657, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0727, + "step": 4400 + }, + { + "epoch": 0.12609006433166547, + "grad_norm": 0.6000984907150269, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0459, + "step": 4410 + }, + { + "epoch": 0.1263759828448892, + "grad_norm": 0.8751336932182312, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0509, + "step": 4420 + }, + { + "epoch": 0.12666190135811295, + "grad_norm": 0.9281551837921143, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0536, + "step": 4430 + }, + { + "epoch": 0.12694781987133666, + "grad_norm": 0.5268979668617249, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 0.1272337383845604, + "grad_norm": 0.9246962070465088, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0525, + "step": 4450 + }, + { + "epoch": 0.12751965689778413, + "grad_norm": 1.2159569263458252, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0559, + "step": 4460 + }, + { + "epoch": 0.12780557541100787, + "grad_norm": 1.1705470085144043, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0473, + "step": 4470 + }, + { + "epoch": 0.12809149392423158, + "grad_norm": 0.4624033570289612, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0385, + "step": 4480 + }, + { + "epoch": 0.12837741243745532, + "grad_norm": 0.68497633934021, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.055, + "step": 4490 + }, + { + "epoch": 0.12866333095067906, + "grad_norm": 0.6132450699806213, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0512, + "step": 4500 + }, + { + "epoch": 0.1289492494639028, + "grad_norm": 0.7438398003578186, + "learning_rate": 1.935753861926916e-05, + "loss": 0.057, + "step": 4510 + }, + { + "epoch": 0.12923516797712653, + "grad_norm": 1.01064133644104, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0542, + "step": 4520 + }, + { + "epoch": 0.12952108649035024, + "grad_norm": 0.7620115280151367, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0511, + "step": 4530 + }, + { + "epoch": 0.12980700500357398, + "grad_norm": 0.8325042128562927, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0434, + "step": 4540 + }, + { + "epoch": 0.13009292351679771, + "grad_norm": 1.333525538444519, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0527, + "step": 4550 + }, + { + "epoch": 0.13037884203002145, + "grad_norm": 0.5498093962669373, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0455, + "step": 4560 + }, + { + "epoch": 0.13066476054324516, + "grad_norm": 0.8072503209114075, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0471, + "step": 4570 + }, + { + "epoch": 0.1309506790564689, + "grad_norm": 0.7596970200538635, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0476, + "step": 4580 + }, + { + "epoch": 0.13123659756969264, + "grad_norm": 0.5895066857337952, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.058, + "step": 4590 + }, + { + "epoch": 0.13152251608291637, + "grad_norm": 0.7977209687232971, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0453, + "step": 4600 + }, + { + "epoch": 0.1318084345961401, + "grad_norm": 0.6070771813392639, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0626, + "step": 4610 + }, + { + "epoch": 0.13209435310936382, + "grad_norm": 0.776318371295929, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0502, + "step": 4620 + }, + { + "epoch": 0.13238027162258756, + "grad_norm": 0.7913787961006165, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0495, + "step": 4630 + }, + { + "epoch": 0.1326661901358113, + "grad_norm": 0.7327920794487, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0537, + "step": 4640 + }, + { + "epoch": 0.13295210864903503, + "grad_norm": 1.2004122734069824, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0479, + "step": 4650 + }, + { + "epoch": 0.13323802716225874, + "grad_norm": 0.663301408290863, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0426, + "step": 4660 + }, + { + "epoch": 0.13352394567548248, + "grad_norm": 0.7744486331939697, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0538, + "step": 4670 + }, + { + "epoch": 0.13380986418870622, + "grad_norm": 0.6179795265197754, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.13409578270192996, + "grad_norm": 0.6461634635925293, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0458, + "step": 4690 + }, + { + "epoch": 0.1343817012151537, + "grad_norm": 0.6578474640846252, + "learning_rate": 1.928703895604588e-05, + "loss": 0.064, + "step": 4700 + }, + { + "epoch": 0.1346676197283774, + "grad_norm": 0.8851020336151123, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0632, + "step": 4710 + }, + { + "epoch": 0.13495353824160114, + "grad_norm": 0.4704781472682953, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0431, + "step": 4720 + }, + { + "epoch": 0.13523945675482488, + "grad_norm": 0.9809741377830505, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.059, + "step": 4730 + }, + { + "epoch": 0.13552537526804861, + "grad_norm": 0.9307458400726318, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0528, + "step": 4740 + }, + { + "epoch": 0.13581129378127232, + "grad_norm": 0.8084405660629272, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0603, + "step": 4750 + }, + { + "epoch": 0.13609721229449606, + "grad_norm": 0.6919799447059631, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0589, + "step": 4760 + }, + { + "epoch": 0.1363831308077198, + "grad_norm": 0.8543849587440491, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0508, + "step": 4770 + }, + { + "epoch": 0.13666904932094354, + "grad_norm": 0.6308473348617554, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0485, + "step": 4780 + }, + { + "epoch": 0.13695496783416727, + "grad_norm": 0.739931046962738, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0534, + "step": 4790 + }, + { + "epoch": 0.13724088634739098, + "grad_norm": 0.7895604372024536, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0449, + "step": 4800 + }, + { + "epoch": 0.13752680486061472, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0496, + "step": 4810 + }, + { + "epoch": 0.13781272337383846, + "grad_norm": 0.5999978184700012, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.1380986418870622, + "grad_norm": 0.8037213087081909, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0667, + "step": 4830 + }, + { + "epoch": 0.1383845604002859, + "grad_norm": 0.7414689064025879, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0509, + "step": 4840 + }, + { + "epoch": 0.13867047891350964, + "grad_norm": 0.6627739667892456, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0515, + "step": 4850 + }, + { + "epoch": 0.13895639742673338, + "grad_norm": 0.6969587802886963, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0626, + "step": 4860 + }, + { + "epoch": 0.13924231593995712, + "grad_norm": 0.7554855942726135, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0478, + "step": 4870 + }, + { + "epoch": 0.13952823445318085, + "grad_norm": 0.5623564124107361, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.044, + "step": 4880 + }, + { + "epoch": 0.13981415296640456, + "grad_norm": 0.6897832751274109, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0466, + "step": 4890 + }, + { + "epoch": 0.1401000714796283, + "grad_norm": 0.5474520921707153, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0514, + "step": 4900 + }, + { + "epoch": 0.14038598999285204, + "grad_norm": 0.9736361503601074, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0472, + "step": 4910 + }, + { + "epoch": 0.14067190850607578, + "grad_norm": 0.5566041469573975, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0533, + "step": 4920 + }, + { + "epoch": 0.1409578270192995, + "grad_norm": 1.0295166969299316, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0478, + "step": 4930 + }, + { + "epoch": 0.14124374553252322, + "grad_norm": 1.0931389331817627, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0652, + "step": 4940 + }, + { + "epoch": 0.14152966404574696, + "grad_norm": 1.3054399490356445, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0564, + "step": 4950 + }, + { + "epoch": 0.1418155825589707, + "grad_norm": 0.45592883229255676, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0456, + "step": 4960 + }, + { + "epoch": 0.14210150107219444, + "grad_norm": 0.6758268475532532, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0543, + "step": 4970 + }, + { + "epoch": 0.14238741958541815, + "grad_norm": 0.9643615484237671, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 0.14267333809864188, + "grad_norm": 0.565969705581665, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0458, + "step": 4990 + }, + { + "epoch": 0.14295925661186562, + "grad_norm": 0.8053064346313477, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0558, + "step": 5000 + }, + { + "epoch": 0.14324517512508936, + "grad_norm": 0.606215238571167, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0562, + "step": 5010 + }, + { + "epoch": 0.14353109363831307, + "grad_norm": 0.5565656423568726, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0543, + "step": 5020 + }, + { + "epoch": 0.1438170121515368, + "grad_norm": 0.353696346282959, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0451, + "step": 5030 + }, + { + "epoch": 0.14410293066476054, + "grad_norm": 0.6627641916275024, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0607, + "step": 5040 + }, + { + "epoch": 0.14438884917798428, + "grad_norm": 0.7896742224693298, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0496, + "step": 5050 + }, + { + "epoch": 0.14467476769120802, + "grad_norm": 0.7444631457328796, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0641, + "step": 5060 + }, + { + "epoch": 0.14496068620443173, + "grad_norm": 0.7871376872062683, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 0.14524660471765546, + "grad_norm": 0.7784642577171326, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0466, + "step": 5080 + }, + { + "epoch": 0.1455325232308792, + "grad_norm": 0.6950685381889343, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0457, + "step": 5090 + }, + { + "epoch": 0.14581844174410294, + "grad_norm": 1.0631619691848755, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0513, + "step": 5100 + }, + { + "epoch": 0.14610436025732665, + "grad_norm": 0.4327051639556885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0599, + "step": 5110 + }, + { + "epoch": 0.14639027877055039, + "grad_norm": 0.7790032029151917, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0617, + "step": 5120 + }, + { + "epoch": 0.14667619728377412, + "grad_norm": 0.42061591148376465, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.14696211579699786, + "grad_norm": 1.4090712070465088, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0564, + "step": 5140 + }, + { + "epoch": 0.1472480343102216, + "grad_norm": 0.540844738483429, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0505, + "step": 5150 + }, + { + "epoch": 0.1475339528234453, + "grad_norm": 0.5608566999435425, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0583, + "step": 5160 + }, + { + "epoch": 0.14781987133666905, + "grad_norm": 0.750708818435669, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0467, + "step": 5170 + }, + { + "epoch": 0.14810578984989278, + "grad_norm": 0.608989953994751, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0507, + "step": 5180 + }, + { + "epoch": 0.14839170836311652, + "grad_norm": 0.8176707029342651, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0455, + "step": 5190 + }, + { + "epoch": 0.14867762687634023, + "grad_norm": 0.5280511379241943, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0586, + "step": 5200 + }, + { + "epoch": 0.14896354538956397, + "grad_norm": 0.5914652347564697, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.042, + "step": 5210 + }, + { + "epoch": 0.1492494639027877, + "grad_norm": 0.4816238582134247, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0431, + "step": 5220 + }, + { + "epoch": 0.14953538241601144, + "grad_norm": 0.5413132309913635, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0453, + "step": 5230 + }, + { + "epoch": 0.14982130092923518, + "grad_norm": 0.749200701713562, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0505, + "step": 5240 + }, + { + "epoch": 0.1501072194424589, + "grad_norm": 0.8051598072052002, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.15039313795568263, + "grad_norm": 0.5365609526634216, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0383, + "step": 5260 + }, + { + "epoch": 0.15067905646890636, + "grad_norm": 0.5546812415122986, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0438, + "step": 5270 + }, + { + "epoch": 0.1509649749821301, + "grad_norm": 0.6248345375061035, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.045, + "step": 5280 + }, + { + "epoch": 0.1512508934953538, + "grad_norm": 0.42673179507255554, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0435, + "step": 5290 + }, + { + "epoch": 0.15153681200857755, + "grad_norm": 0.6677115559577942, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 0.15182273052180129, + "grad_norm": 0.4739227294921875, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0516, + "step": 5310 + }, + { + "epoch": 0.15210864903502502, + "grad_norm": 0.7931821346282959, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0566, + "step": 5320 + }, + { + "epoch": 0.15239456754824876, + "grad_norm": 0.6296460032463074, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0496, + "step": 5330 + }, + { + "epoch": 0.15268048606147247, + "grad_norm": 0.6713911890983582, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0462, + "step": 5340 + }, + { + "epoch": 0.1529664045746962, + "grad_norm": 1.088040828704834, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0663, + "step": 5350 + }, + { + "epoch": 0.15325232308791994, + "grad_norm": 1.4942265748977661, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0541, + "step": 5360 + }, + { + "epoch": 0.15353824160114368, + "grad_norm": 1.5721286535263062, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0546, + "step": 5370 + }, + { + "epoch": 0.1538241601143674, + "grad_norm": 0.9329798221588135, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0538, + "step": 5380 + }, + { + "epoch": 0.15411007862759113, + "grad_norm": 0.5658103823661804, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0561, + "step": 5390 + }, + { + "epoch": 0.15439599714081487, + "grad_norm": 0.6210218071937561, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.054, + "step": 5400 + }, + { + "epoch": 0.1546819156540386, + "grad_norm": 0.7934702634811401, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0506, + "step": 5410 + }, + { + "epoch": 0.15496783416726234, + "grad_norm": 1.0321810245513916, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0483, + "step": 5420 + }, + { + "epoch": 0.15525375268048605, + "grad_norm": 0.6226248145103455, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0464, + "step": 5430 + }, + { + "epoch": 0.1555396711937098, + "grad_norm": 0.6217877864837646, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0466, + "step": 5440 + }, + { + "epoch": 0.15582558970693353, + "grad_norm": 0.44068101048469543, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0517, + "step": 5450 + }, + { + "epoch": 0.15611150822015726, + "grad_norm": 0.4715922772884369, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0391, + "step": 5460 + }, + { + "epoch": 0.15639742673338097, + "grad_norm": 0.6649858951568604, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0524, + "step": 5470 + }, + { + "epoch": 0.1566833452466047, + "grad_norm": 0.5635918974876404, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.054, + "step": 5480 + }, + { + "epoch": 0.15696926375982845, + "grad_norm": 0.5584990978240967, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0559, + "step": 5490 + }, + { + "epoch": 0.15725518227305219, + "grad_norm": 0.7777124047279358, + "learning_rate": 1.895206504082557e-05, + "loss": 0.052, + "step": 5500 + }, + { + "epoch": 0.15754110078627592, + "grad_norm": 0.7057285308837891, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0507, + "step": 5510 + }, + { + "epoch": 0.15782701929949963, + "grad_norm": 0.4290146827697754, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0508, + "step": 5520 + }, + { + "epoch": 0.15811293781272337, + "grad_norm": 0.7333746552467346, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0609, + "step": 5530 + }, + { + "epoch": 0.1583988563259471, + "grad_norm": 0.6905514001846313, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0441, + "step": 5540 + }, + { + "epoch": 0.15868477483917084, + "grad_norm": 0.4859441816806793, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0586, + "step": 5550 + }, + { + "epoch": 0.15897069335239455, + "grad_norm": 0.4259501099586487, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0446, + "step": 5560 + }, + { + "epoch": 0.1592566118656183, + "grad_norm": 0.7659216523170471, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0486, + "step": 5570 + }, + { + "epoch": 0.15954253037884203, + "grad_norm": 0.6377918124198914, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0497, + "step": 5580 + }, + { + "epoch": 0.15982844889206577, + "grad_norm": 0.9122095704078674, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0497, + "step": 5590 + }, + { + "epoch": 0.1601143674052895, + "grad_norm": 0.5986319780349731, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0789, + "step": 5600 + }, + { + "epoch": 0.1604002859185132, + "grad_norm": 0.6486982107162476, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0481, + "step": 5610 + }, + { + "epoch": 0.16068620443173695, + "grad_norm": 0.9778286814689636, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0609, + "step": 5620 + }, + { + "epoch": 0.1609721229449607, + "grad_norm": 0.9133608341217041, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0499, + "step": 5630 + }, + { + "epoch": 0.16125804145818443, + "grad_norm": 0.8979085087776184, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0539, + "step": 5640 + }, + { + "epoch": 0.16154395997140814, + "grad_norm": 0.7787102460861206, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0593, + "step": 5650 + }, + { + "epoch": 0.16182987848463187, + "grad_norm": 0.8269296884536743, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0466, + "step": 5660 + }, + { + "epoch": 0.1621157969978556, + "grad_norm": 1.0018537044525146, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0542, + "step": 5670 + }, + { + "epoch": 0.16240171551107935, + "grad_norm": 0.6690066456794739, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0504, + "step": 5680 + }, + { + "epoch": 0.16268763402430308, + "grad_norm": 0.8186119198799133, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0523, + "step": 5690 + }, + { + "epoch": 0.1629735525375268, + "grad_norm": 0.6039218902587891, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.053, + "step": 5700 + }, + { + "epoch": 0.16325947105075053, + "grad_norm": 0.5570294857025146, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0627, + "step": 5710 + }, + { + "epoch": 0.16354538956397427, + "grad_norm": 0.6330029368400574, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.043, + "step": 5720 + }, + { + "epoch": 0.163831308077198, + "grad_norm": 0.42857953906059265, + "learning_rate": 1.884459101447439e-05, + "loss": 0.043, + "step": 5730 + }, + { + "epoch": 0.16411722659042172, + "grad_norm": 0.6611765027046204, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0478, + "step": 5740 + }, + { + "epoch": 0.16440314510364545, + "grad_norm": 0.5025321841239929, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0563, + "step": 5750 + }, + { + "epoch": 0.1646890636168692, + "grad_norm": 0.468772292137146, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0579, + "step": 5760 + }, + { + "epoch": 0.16497498213009293, + "grad_norm": 0.8914149403572083, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0451, + "step": 5770 + }, + { + "epoch": 0.16526090064331667, + "grad_norm": 0.7421362996101379, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0446, + "step": 5780 + }, + { + "epoch": 0.16554681915654038, + "grad_norm": 0.6159907579421997, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0486, + "step": 5790 + }, + { + "epoch": 0.1658327376697641, + "grad_norm": 0.7762402892112732, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0528, + "step": 5800 + }, + { + "epoch": 0.16611865618298785, + "grad_norm": 0.688562273979187, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0562, + "step": 5810 + }, + { + "epoch": 0.1664045746962116, + "grad_norm": 0.6233720183372498, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0454, + "step": 5820 + }, + { + "epoch": 0.1666904932094353, + "grad_norm": 1.0762931108474731, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0586, + "step": 5830 + }, + { + "epoch": 0.16697641172265903, + "grad_norm": 0.6782101988792419, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0486, + "step": 5840 + }, + { + "epoch": 0.16726233023588277, + "grad_norm": 0.8854986429214478, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0668, + "step": 5850 + }, + { + "epoch": 0.1675482487491065, + "grad_norm": 0.6537308096885681, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0456, + "step": 5860 + }, + { + "epoch": 0.16783416726233025, + "grad_norm": 1.4588080644607544, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0531, + "step": 5870 + }, + { + "epoch": 0.16812008577555396, + "grad_norm": 0.4888838529586792, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0608, + "step": 5880 + }, + { + "epoch": 0.1684060042887777, + "grad_norm": 0.6046859622001648, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0596, + "step": 5890 + }, + { + "epoch": 0.16869192280200143, + "grad_norm": 1.0373053550720215, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0672, + "step": 5900 + }, + { + "epoch": 0.16897784131522517, + "grad_norm": 0.7728743553161621, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0567, + "step": 5910 + }, + { + "epoch": 0.16926375982844888, + "grad_norm": 0.7804396152496338, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0443, + "step": 5920 + }, + { + "epoch": 0.16954967834167262, + "grad_norm": 0.5331568717956543, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0462, + "step": 5930 + }, + { + "epoch": 0.16983559685489635, + "grad_norm": 0.5623118877410889, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0549, + "step": 5940 + }, + { + "epoch": 0.1701215153681201, + "grad_norm": 0.5113009214401245, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.17040743388134383, + "grad_norm": 0.45996031165122986, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0474, + "step": 5960 + }, + { + "epoch": 0.17069335239456754, + "grad_norm": 0.9673702716827393, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0496, + "step": 5970 + }, + { + "epoch": 0.17097927090779128, + "grad_norm": 0.6134442687034607, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0472, + "step": 5980 + }, + { + "epoch": 0.171265189421015, + "grad_norm": 0.5929660797119141, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0448, + "step": 5990 + }, + { + "epoch": 0.17155110793423875, + "grad_norm": 0.6973591446876526, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0561, + "step": 6000 + }, + { + "epoch": 0.17183702644746246, + "grad_norm": 0.6361686587333679, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0528, + "step": 6010 + }, + { + "epoch": 0.1721229449606862, + "grad_norm": 0.8463344573974609, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0505, + "step": 6020 + }, + { + "epoch": 0.17240886347390993, + "grad_norm": 0.7931243777275085, + "learning_rate": 1.869709961183946e-05, + "loss": 0.047, + "step": 6030 + }, + { + "epoch": 0.17269478198713367, + "grad_norm": 0.8827673196792603, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0449, + "step": 6040 + }, + { + "epoch": 0.1729807005003574, + "grad_norm": 0.624167263507843, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0432, + "step": 6050 + }, + { + "epoch": 0.17326661901358112, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0546, + "step": 6060 + }, + { + "epoch": 0.17355253752680486, + "grad_norm": 0.6836652755737305, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0463, + "step": 6070 + }, + { + "epoch": 0.1738384560400286, + "grad_norm": 0.5454772114753723, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0554, + "step": 6080 + }, + { + "epoch": 0.17412437455325233, + "grad_norm": 0.3758164048194885, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0437, + "step": 6090 + }, + { + "epoch": 0.17441029306647604, + "grad_norm": 0.4269026517868042, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0436, + "step": 6100 + }, + { + "epoch": 0.17469621157969978, + "grad_norm": 1.3504232168197632, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0563, + "step": 6110 + }, + { + "epoch": 0.17498213009292352, + "grad_norm": 0.6270191669464111, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0552, + "step": 6120 + }, + { + "epoch": 0.17526804860614725, + "grad_norm": 0.7632624506950378, + "learning_rate": 1.864612143364565e-05, + "loss": 0.042, + "step": 6130 + }, + { + "epoch": 0.175553967119371, + "grad_norm": 0.7420883774757385, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0472, + "step": 6140 + }, + { + "epoch": 0.1758398856325947, + "grad_norm": 0.38518550992012024, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0494, + "step": 6150 + }, + { + "epoch": 0.17612580414581844, + "grad_norm": 0.4203122556209564, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.049, + "step": 6160 + }, + { + "epoch": 0.17641172265904217, + "grad_norm": 0.843169093132019, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0528, + "step": 6170 + }, + { + "epoch": 0.1766976411722659, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0441, + "step": 6180 + }, + { + "epoch": 0.17698355968548962, + "grad_norm": 0.9894040822982788, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0494, + "step": 6190 + }, + { + "epoch": 0.17726947819871336, + "grad_norm": 0.8269744515419006, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0533, + "step": 6200 + }, + { + "epoch": 0.1775553967119371, + "grad_norm": 0.7923200726509094, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0518, + "step": 6210 + }, + { + "epoch": 0.17784131522516083, + "grad_norm": 0.580436646938324, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 0.17812723373838457, + "grad_norm": 1.0633399486541748, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0528, + "step": 6230 + }, + { + "epoch": 0.17841315225160828, + "grad_norm": 0.925599217414856, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0552, + "step": 6240 + }, + { + "epoch": 0.17869907076483202, + "grad_norm": 0.5874597430229187, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0473, + "step": 6250 + }, + { + "epoch": 0.17898498927805576, + "grad_norm": 0.9065818190574646, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0504, + "step": 6260 + }, + { + "epoch": 0.1792709077912795, + "grad_norm": 0.9060930609703064, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0578, + "step": 6270 + }, + { + "epoch": 0.1795568263045032, + "grad_norm": 0.6221855878829956, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.17984274481772694, + "grad_norm": 0.589621901512146, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0452, + "step": 6290 + }, + { + "epoch": 0.18012866333095068, + "grad_norm": 0.4308580756187439, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0528, + "step": 6300 + }, + { + "epoch": 0.18041458184417442, + "grad_norm": 0.34031248092651367, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0544, + "step": 6310 + }, + { + "epoch": 0.18070050035739815, + "grad_norm": 0.6438931226730347, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0411, + "step": 6320 + }, + { + "epoch": 0.18098641887062186, + "grad_norm": 0.5436957478523254, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0381, + "step": 6330 + }, + { + "epoch": 0.1812723373838456, + "grad_norm": 0.7326043248176575, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0486, + "step": 6340 + }, + { + "epoch": 0.18155825589706934, + "grad_norm": 0.9194608330726624, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0455, + "step": 6350 + }, + { + "epoch": 0.18184417441029307, + "grad_norm": 0.9366886019706726, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0529, + "step": 6360 + }, + { + "epoch": 0.18213009292351678, + "grad_norm": 0.3178311586380005, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0455, + "step": 6370 + }, + { + "epoch": 0.18241601143674052, + "grad_norm": 0.9811000823974609, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.054, + "step": 6380 + }, + { + "epoch": 0.18270192994996426, + "grad_norm": 0.4635869562625885, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0466, + "step": 6390 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.6958444118499756, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0448, + "step": 6400 + }, + { + "epoch": 0.18327376697641173, + "grad_norm": 0.765814483165741, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0558, + "step": 6410 + }, + { + "epoch": 0.18355968548963544, + "grad_norm": 0.4117525815963745, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0484, + "step": 6420 + }, + { + "epoch": 0.18384560400285918, + "grad_norm": 0.6114997267723083, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0449, + "step": 6430 + }, + { + "epoch": 0.18413152251608292, + "grad_norm": 0.6006572842597961, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0442, + "step": 6440 + }, + { + "epoch": 0.18441744102930666, + "grad_norm": 0.5918669104576111, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0472, + "step": 6450 + }, + { + "epoch": 0.18470335954253037, + "grad_norm": 0.42107391357421875, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0471, + "step": 6460 + }, + { + "epoch": 0.1849892780557541, + "grad_norm": 0.5666350722312927, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0451, + "step": 6470 + }, + { + "epoch": 0.18527519656897784, + "grad_norm": 0.6074198484420776, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.051, + "step": 6480 + }, + { + "epoch": 0.18556111508220158, + "grad_norm": 0.771105945110321, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0402, + "step": 6490 + }, + { + "epoch": 0.18584703359542531, + "grad_norm": 0.6381934881210327, + "learning_rate": 1.844974808419918e-05, + "loss": 0.049, + "step": 6500 + }, + { + "epoch": 0.18613295210864902, + "grad_norm": 0.4039069712162018, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0477, + "step": 6510 + }, + { + "epoch": 0.18641887062187276, + "grad_norm": 0.8936404585838318, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0515, + "step": 6520 + }, + { + "epoch": 0.1867047891350965, + "grad_norm": 0.5358276963233948, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0397, + "step": 6530 + }, + { + "epoch": 0.18699070764832024, + "grad_norm": 0.7260947823524475, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0501, + "step": 6540 + }, + { + "epoch": 0.18727662616154395, + "grad_norm": 0.6378960609436035, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0575, + "step": 6550 + }, + { + "epoch": 0.18756254467476768, + "grad_norm": 0.5879429578781128, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.041, + "step": 6560 + }, + { + "epoch": 0.18784846318799142, + "grad_norm": 0.846297025680542, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0494, + "step": 6570 + }, + { + "epoch": 0.18813438170121516, + "grad_norm": 0.5211764574050903, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0463, + "step": 6580 + }, + { + "epoch": 0.1884203002144389, + "grad_norm": 0.8060504794120789, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0486, + "step": 6590 + }, + { + "epoch": 0.1887062187276626, + "grad_norm": 0.5741685628890991, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0435, + "step": 6600 + }, + { + "epoch": 0.18899213724088634, + "grad_norm": 0.6195408701896667, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0609, + "step": 6610 + }, + { + "epoch": 0.18927805575411008, + "grad_norm": 0.46843090653419495, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0477, + "step": 6620 + }, + { + "epoch": 0.18956397426733382, + "grad_norm": 0.5169982314109802, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0515, + "step": 6630 + }, + { + "epoch": 0.18984989278055753, + "grad_norm": 0.5571608543395996, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0492, + "step": 6640 + }, + { + "epoch": 0.19013581129378126, + "grad_norm": 0.7798209190368652, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0682, + "step": 6650 + }, + { + "epoch": 0.190421729807005, + "grad_norm": 0.6120383143424988, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0516, + "step": 6660 + }, + { + "epoch": 0.19070764832022874, + "grad_norm": 1.0191924571990967, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.049, + "step": 6670 + }, + { + "epoch": 0.19099356683345248, + "grad_norm": 0.5271646976470947, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0461, + "step": 6680 + }, + { + "epoch": 0.1912794853466762, + "grad_norm": 0.3315111994743347, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 0.19156540385989992, + "grad_norm": 0.7598944306373596, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0576, + "step": 6700 + }, + { + "epoch": 0.19185132237312366, + "grad_norm": 0.8039186596870422, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0489, + "step": 6710 + }, + { + "epoch": 0.1921372408863474, + "grad_norm": 0.911704957485199, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0508, + "step": 6720 + }, + { + "epoch": 0.1924231593995711, + "grad_norm": 0.6092261672019958, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0494, + "step": 6730 + }, + { + "epoch": 0.19270907791279485, + "grad_norm": 0.7890674471855164, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.049, + "step": 6740 + }, + { + "epoch": 0.19299499642601858, + "grad_norm": 0.8601320385932922, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0402, + "step": 6750 + }, + { + "epoch": 0.19328091493924232, + "grad_norm": 0.8750951290130615, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0517, + "step": 6760 + }, + { + "epoch": 0.19356683345246606, + "grad_norm": 0.7143217921257019, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0414, + "step": 6770 + }, + { + "epoch": 0.19385275196568977, + "grad_norm": 0.8340809345245361, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.1941386704789135, + "grad_norm": 0.4074079692363739, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0455, + "step": 6790 + }, + { + "epoch": 0.19442458899213724, + "grad_norm": 0.5369135737419128, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0472, + "step": 6800 + }, + { + "epoch": 0.19471050750536098, + "grad_norm": 0.44467195868492126, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 0.1949964260185847, + "grad_norm": 0.6032440662384033, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0479, + "step": 6820 + }, + { + "epoch": 0.19528234453180843, + "grad_norm": 0.4078349173069, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 0.19556826304503216, + "grad_norm": 0.49480268359184265, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0432, + "step": 6840 + }, + { + "epoch": 0.1958541815582559, + "grad_norm": 0.9844514727592468, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0479, + "step": 6850 + }, + { + "epoch": 0.19614010007147964, + "grad_norm": 1.1353951692581177, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0539, + "step": 6860 + }, + { + "epoch": 0.19642601858470335, + "grad_norm": 0.7535272836685181, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0572, + "step": 6870 + }, + { + "epoch": 0.1967119370979271, + "grad_norm": 0.4950162470340729, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0466, + "step": 6880 + }, + { + "epoch": 0.19699785561115082, + "grad_norm": 0.5310598015785217, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0487, + "step": 6890 + }, + { + "epoch": 0.19728377412437456, + "grad_norm": 0.9481188654899597, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0448, + "step": 6900 + }, + { + "epoch": 0.19756969263759827, + "grad_norm": 0.5303207039833069, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0419, + "step": 6910 + }, + { + "epoch": 0.197855611150822, + "grad_norm": 0.6180852055549622, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 0.19814152966404575, + "grad_norm": 0.5310384631156921, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0471, + "step": 6930 + }, + { + "epoch": 0.19842744817726948, + "grad_norm": 0.546660304069519, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0481, + "step": 6940 + }, + { + "epoch": 0.19871336669049322, + "grad_norm": 0.7824214696884155, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0423, + "step": 6950 + }, + { + "epoch": 0.19899928520371693, + "grad_norm": 0.9130761623382568, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0436, + "step": 6960 + }, + { + "epoch": 0.19928520371694067, + "grad_norm": 1.0512481927871704, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 0.1995711222301644, + "grad_norm": 0.8660218715667725, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0533, + "step": 6980 + }, + { + "epoch": 0.19985704074338814, + "grad_norm": 0.5280078649520874, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0455, + "step": 6990 + }, + { + "epoch": 0.20014295925661185, + "grad_norm": 0.6151753067970276, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0476, + "step": 7000 + }, + { + "epoch": 0.2004288777698356, + "grad_norm": 0.7165628671646118, + "learning_rate": 1.815952390818299e-05, + "loss": 0.051, + "step": 7010 + }, + { + "epoch": 0.20071479628305933, + "grad_norm": 0.6857513189315796, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0566, + "step": 7020 + }, + { + "epoch": 0.20100071479628306, + "grad_norm": 0.5589154958724976, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0741, + "step": 7030 + }, + { + "epoch": 0.2012866333095068, + "grad_norm": 0.6684713959693909, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0461, + "step": 7040 + }, + { + "epoch": 0.2015725518227305, + "grad_norm": 0.41142046451568604, + "learning_rate": 1.813582526827608e-05, + "loss": 0.043, + "step": 7050 + }, + { + "epoch": 0.20185847033595425, + "grad_norm": 0.29734253883361816, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0464, + "step": 7060 + }, + { + "epoch": 0.20214438884917799, + "grad_norm": 0.3914707899093628, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.052, + "step": 7070 + }, + { + "epoch": 0.20243030736240172, + "grad_norm": 0.5075880885124207, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0401, + "step": 7080 + }, + { + "epoch": 0.20271622587562543, + "grad_norm": 0.6182138919830322, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0428, + "step": 7090 + }, + { + "epoch": 0.20300214438884917, + "grad_norm": 1.0438663959503174, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0476, + "step": 7100 + }, + { + "epoch": 0.2032880629020729, + "grad_norm": 0.4646940529346466, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0419, + "step": 7110 + }, + { + "epoch": 0.20357398141529665, + "grad_norm": 0.4236893951892853, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0539, + "step": 7120 + }, + { + "epoch": 0.20385989992852038, + "grad_norm": 0.7975651025772095, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0459, + "step": 7130 + }, + { + "epoch": 0.2041458184417441, + "grad_norm": 0.9628227949142456, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0568, + "step": 7140 + }, + { + "epoch": 0.20443173695496783, + "grad_norm": 0.8878718614578247, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0528, + "step": 7150 + }, + { + "epoch": 0.20471765546819157, + "grad_norm": 0.5407359004020691, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0417, + "step": 7160 + }, + { + "epoch": 0.2050035739814153, + "grad_norm": 0.4407803416252136, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0435, + "step": 7170 + }, + { + "epoch": 0.20528949249463901, + "grad_norm": 0.4055456221103668, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0652, + "step": 7180 + }, + { + "epoch": 0.20557541100786275, + "grad_norm": 0.44706887006759644, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0476, + "step": 7190 + }, + { + "epoch": 0.2058613295210865, + "grad_norm": 1.2640881538391113, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0496, + "step": 7200 + }, + { + "epoch": 0.20614724803431023, + "grad_norm": 0.3773214817047119, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0455, + "step": 7210 + }, + { + "epoch": 0.20643316654753396, + "grad_norm": 0.6460191011428833, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0765, + "step": 7220 + }, + { + "epoch": 0.20671908506075767, + "grad_norm": 0.6048172116279602, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0506, + "step": 7230 + }, + { + "epoch": 0.2070050035739814, + "grad_norm": 0.38502392172813416, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0487, + "step": 7240 + }, + { + "epoch": 0.20729092208720515, + "grad_norm": 1.5727262496948242, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.20757684060042889, + "grad_norm": 0.3985368609428406, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0501, + "step": 7260 + }, + { + "epoch": 0.2078627591136526, + "grad_norm": 0.4519219994544983, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0542, + "step": 7270 + }, + { + "epoch": 0.20814867762687633, + "grad_norm": 0.6547327637672424, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0628, + "step": 7280 + }, + { + "epoch": 0.20843459614010007, + "grad_norm": 0.7864896655082703, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0521, + "step": 7290 + }, + { + "epoch": 0.2087205146533238, + "grad_norm": 0.6605416536331177, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0501, + "step": 7300 + }, + { + "epoch": 0.20900643316654754, + "grad_norm": 0.8260928988456726, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 0.20929235167977125, + "grad_norm": 0.7167025804519653, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0465, + "step": 7320 + }, + { + "epoch": 0.209578270192995, + "grad_norm": 0.6838316917419434, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0449, + "step": 7330 + }, + { + "epoch": 0.20986418870621873, + "grad_norm": 0.46520882844924927, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0441, + "step": 7340 + }, + { + "epoch": 0.21015010721944247, + "grad_norm": 0.680860698223114, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0498, + "step": 7350 + }, + { + "epoch": 0.21043602573266618, + "grad_norm": 0.6697542071342468, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0361, + "step": 7360 + }, + { + "epoch": 0.21072194424588991, + "grad_norm": 0.9322425127029419, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0561, + "step": 7370 + }, + { + "epoch": 0.21100786275911365, + "grad_norm": 0.7454982399940491, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0464, + "step": 7380 + }, + { + "epoch": 0.2112937812723374, + "grad_norm": 0.5052962899208069, + "learning_rate": 1.792902262617481e-05, + "loss": 0.042, + "step": 7390 + }, + { + "epoch": 0.21157969978556113, + "grad_norm": 0.622719407081604, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0392, + "step": 7400 + }, + { + "epoch": 0.21186561829878484, + "grad_norm": 0.8296751976013184, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0512, + "step": 7410 + }, + { + "epoch": 0.21215153681200857, + "grad_norm": 0.7341750860214233, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0407, + "step": 7420 + }, + { + "epoch": 0.2124374553252323, + "grad_norm": 0.8206498026847839, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0491, + "step": 7430 + }, + { + "epoch": 0.21272337383845605, + "grad_norm": 0.5625871419906616, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0405, + "step": 7440 + }, + { + "epoch": 0.21300929235167976, + "grad_norm": 0.600284218788147, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0464, + "step": 7450 + }, + { + "epoch": 0.2132952108649035, + "grad_norm": 1.0839911699295044, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0481, + "step": 7460 + }, + { + "epoch": 0.21358112937812723, + "grad_norm": 0.45663371682167053, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0478, + "step": 7470 + }, + { + "epoch": 0.21386704789135097, + "grad_norm": 0.9196961522102356, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0566, + "step": 7480 + }, + { + "epoch": 0.2141529664045747, + "grad_norm": 0.5013288855552673, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0442, + "step": 7490 + }, + { + "epoch": 0.21443888491779842, + "grad_norm": 0.6444706916809082, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0484, + "step": 7500 + }, + { + "epoch": 0.21472480343102215, + "grad_norm": 0.5789361000061035, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0536, + "step": 7510 + }, + { + "epoch": 0.2150107219442459, + "grad_norm": 0.7474827170372009, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0526, + "step": 7520 + }, + { + "epoch": 0.21529664045746963, + "grad_norm": 0.7054215669631958, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0538, + "step": 7530 + }, + { + "epoch": 0.21558255897069334, + "grad_norm": 0.9778858423233032, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0533, + "step": 7540 + }, + { + "epoch": 0.21586847748391708, + "grad_norm": 0.7189548015594482, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0479, + "step": 7550 + }, + { + "epoch": 0.2161543959971408, + "grad_norm": 0.8761522769927979, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0512, + "step": 7560 + }, + { + "epoch": 0.21644031451036455, + "grad_norm": 0.6686418652534485, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.06, + "step": 7570 + }, + { + "epoch": 0.2167262330235883, + "grad_norm": 0.6385156512260437, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0495, + "step": 7580 + }, + { + "epoch": 0.217012151536812, + "grad_norm": 0.4785522520542145, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0477, + "step": 7590 + }, + { + "epoch": 0.21729807005003574, + "grad_norm": 0.883179783821106, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0472, + "step": 7600 + }, + { + "epoch": 0.21758398856325947, + "grad_norm": 0.5431568026542664, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0383, + "step": 7610 + }, + { + "epoch": 0.2178699070764832, + "grad_norm": 0.7085764408111572, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0465, + "step": 7620 + }, + { + "epoch": 0.21815582558970692, + "grad_norm": 0.4877212643623352, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0517, + "step": 7630 + }, + { + "epoch": 0.21844174410293066, + "grad_norm": 0.6874392032623291, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0555, + "step": 7640 + }, + { + "epoch": 0.2187276626161544, + "grad_norm": 0.9611791372299194, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0415, + "step": 7650 + }, + { + "epoch": 0.21901358112937813, + "grad_norm": 0.3618314862251282, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0601, + "step": 7660 + }, + { + "epoch": 0.21929949964260187, + "grad_norm": 0.5366251468658447, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0383, + "step": 7670 + }, + { + "epoch": 0.21958541815582558, + "grad_norm": 0.6323129534721375, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0536, + "step": 7680 + }, + { + "epoch": 0.21987133666904932, + "grad_norm": 0.4621681571006775, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0442, + "step": 7690 + }, + { + "epoch": 0.22015725518227305, + "grad_norm": 0.9297679662704468, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0517, + "step": 7700 + }, + { + "epoch": 0.2204431736954968, + "grad_norm": 0.5950489640235901, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0468, + "step": 7710 + }, + { + "epoch": 0.2207290922087205, + "grad_norm": 0.30251142382621765, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0471, + "step": 7720 + }, + { + "epoch": 0.22101501072194424, + "grad_norm": 0.6247804760932922, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0488, + "step": 7730 + }, + { + "epoch": 0.22130092923516798, + "grad_norm": 0.7118366360664368, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0567, + "step": 7740 + }, + { + "epoch": 0.2215868477483917, + "grad_norm": 0.6265056133270264, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.06, + "step": 7750 + }, + { + "epoch": 0.22187276626161545, + "grad_norm": 0.7232056260108948, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0393, + "step": 7760 + }, + { + "epoch": 0.22215868477483916, + "grad_norm": 0.7981307506561279, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0518, + "step": 7770 + }, + { + "epoch": 0.2224446032880629, + "grad_norm": 0.4492819011211395, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0425, + "step": 7780 + }, + { + "epoch": 0.22273052180128664, + "grad_norm": 0.578440248966217, + "learning_rate": 1.767371389304538e-05, + "loss": 0.043, + "step": 7790 + }, + { + "epoch": 0.22301644031451037, + "grad_norm": 0.8093826174736023, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0571, + "step": 7800 + }, + { + "epoch": 0.22330235882773408, + "grad_norm": 0.864661455154419, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0429, + "step": 7810 + }, + { + "epoch": 0.22358827734095782, + "grad_norm": 0.50054532289505, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0404, + "step": 7820 + }, + { + "epoch": 0.22387419585418156, + "grad_norm": 0.5690511465072632, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0406, + "step": 7830 + }, + { + "epoch": 0.2241601143674053, + "grad_norm": 0.7075231671333313, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0478, + "step": 7840 + }, + { + "epoch": 0.22444603288062903, + "grad_norm": 0.6326742768287659, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.065, + "step": 7850 + }, + { + "epoch": 0.22473195139385274, + "grad_norm": 0.48305049538612366, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0456, + "step": 7860 + }, + { + "epoch": 0.22501786990707648, + "grad_norm": 0.6333707571029663, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.048, + "step": 7870 + }, + { + "epoch": 0.22530378842030022, + "grad_norm": 0.6568662524223328, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0422, + "step": 7880 + }, + { + "epoch": 0.22558970693352395, + "grad_norm": 0.6302695870399475, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0589, + "step": 7890 + }, + { + "epoch": 0.22587562544674766, + "grad_norm": 0.6373940110206604, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0504, + "step": 7900 + }, + { + "epoch": 0.2261615439599714, + "grad_norm": 0.7108445167541504, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0486, + "step": 7910 + }, + { + "epoch": 0.22644746247319514, + "grad_norm": 0.5274208784103394, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0693, + "step": 7920 + }, + { + "epoch": 0.22673338098641888, + "grad_norm": 0.4020678997039795, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0461, + "step": 7930 + }, + { + "epoch": 0.2270192994996426, + "grad_norm": 0.5584745407104492, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0376, + "step": 7940 + }, + { + "epoch": 0.22730521801286632, + "grad_norm": 0.6614044904708862, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0461, + "step": 7950 + }, + { + "epoch": 0.22759113652609006, + "grad_norm": 0.506636917591095, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0431, + "step": 7960 + }, + { + "epoch": 0.2278770550393138, + "grad_norm": 0.5168156027793884, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0404, + "step": 7970 + }, + { + "epoch": 0.22816297355253753, + "grad_norm": 0.552480161190033, + "learning_rate": 1.754802282200567e-05, + "loss": 0.0565, + "step": 7980 + }, + { + "epoch": 0.22844889206576124, + "grad_norm": 0.8191191554069519, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0556, + "step": 7990 + }, + { + "epoch": 0.22873481057898498, + "grad_norm": 0.7767695188522339, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0447, + "step": 8000 + }, + { + "epoch": 0.22902072909220872, + "grad_norm": 0.9050281047821045, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0611, + "step": 8010 + }, + { + "epoch": 0.22930664760543246, + "grad_norm": 0.7805314660072327, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0532, + "step": 8020 + }, + { + "epoch": 0.2295925661186562, + "grad_norm": 0.6055987477302551, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0436, + "step": 8030 + }, + { + "epoch": 0.2298784846318799, + "grad_norm": 1.1075741052627563, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.053, + "step": 8040 + }, + { + "epoch": 0.23016440314510364, + "grad_norm": 0.6283855438232422, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0494, + "step": 8050 + }, + { + "epoch": 0.23045032165832738, + "grad_norm": 0.44009697437286377, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.047, + "step": 8060 + }, + { + "epoch": 0.23073624017155112, + "grad_norm": 0.4920162856578827, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0437, + "step": 8070 + }, + { + "epoch": 0.23102215868477483, + "grad_norm": 0.9286724328994751, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0513, + "step": 8080 + }, + { + "epoch": 0.23130807719799856, + "grad_norm": 0.6595107913017273, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0465, + "step": 8090 + }, + { + "epoch": 0.2315939957112223, + "grad_norm": 0.4930933713912964, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0422, + "step": 8100 + }, + { + "epoch": 0.23187991422444604, + "grad_norm": 0.6741859316825867, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0419, + "step": 8110 + }, + { + "epoch": 0.23216583273766978, + "grad_norm": 0.8081800937652588, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0449, + "step": 8120 + }, + { + "epoch": 0.23245175125089348, + "grad_norm": 1.0258036851882935, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0613, + "step": 8130 + }, + { + "epoch": 0.23273766976411722, + "grad_norm": 0.5007345080375671, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0473, + "step": 8140 + }, + { + "epoch": 0.23302358827734096, + "grad_norm": 0.3931804895401001, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0495, + "step": 8150 + }, + { + "epoch": 0.2333095067905647, + "grad_norm": 0.5907166600227356, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0449, + "step": 8160 + }, + { + "epoch": 0.2335954253037884, + "grad_norm": 0.49229851365089417, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0524, + "step": 8170 + }, + { + "epoch": 0.23388134381701214, + "grad_norm": 0.8386240601539612, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0527, + "step": 8180 + }, + { + "epoch": 0.23416726233023588, + "grad_norm": 0.7806615829467773, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0529, + "step": 8190 + }, + { + "epoch": 0.23445318084345962, + "grad_norm": 0.5716270804405212, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0534, + "step": 8200 + }, + { + "epoch": 0.23473909935668336, + "grad_norm": 1.165761947631836, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0591, + "step": 8210 + }, + { + "epoch": 0.23502501786990707, + "grad_norm": 0.867967426776886, + "learning_rate": 1.738529690353544e-05, + "loss": 0.049, + "step": 8220 + }, + { + "epoch": 0.2353109363831308, + "grad_norm": 0.5809492468833923, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0434, + "step": 8230 + }, + { + "epoch": 0.23559685489635454, + "grad_norm": 0.8418740034103394, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0461, + "step": 8240 + }, + { + "epoch": 0.23588277340957828, + "grad_norm": 0.5811617374420166, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0443, + "step": 8250 + }, + { + "epoch": 0.236168691922802, + "grad_norm": 0.7699318528175354, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0549, + "step": 8260 + }, + { + "epoch": 0.23645461043602573, + "grad_norm": 0.6066992878913879, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0415, + "step": 8270 + }, + { + "epoch": 0.23674052894924946, + "grad_norm": 0.7775973677635193, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0619, + "step": 8280 + }, + { + "epoch": 0.2370264474624732, + "grad_norm": 0.8320962190628052, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.048, + "step": 8290 + }, + { + "epoch": 0.23731236597569694, + "grad_norm": 0.7203818559646606, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0594, + "step": 8300 + }, + { + "epoch": 0.23759828448892065, + "grad_norm": 0.7634598612785339, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0614, + "step": 8310 + }, + { + "epoch": 0.23788420300214438, + "grad_norm": 0.557575523853302, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 0.23817012151536812, + "grad_norm": 1.0139968395233154, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0457, + "step": 8330 + }, + { + "epoch": 0.23845604002859186, + "grad_norm": 0.5543113946914673, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.048, + "step": 8340 + }, + { + "epoch": 0.23874195854181557, + "grad_norm": 1.0122590065002441, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0509, + "step": 8350 + }, + { + "epoch": 0.2390278770550393, + "grad_norm": 0.8776134252548218, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0475, + "step": 8360 + }, + { + "epoch": 0.23931379556826304, + "grad_norm": 0.41230106353759766, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0467, + "step": 8370 + }, + { + "epoch": 0.23959971408148678, + "grad_norm": 0.5460986495018005, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0455, + "step": 8380 + }, + { + "epoch": 0.23988563259471052, + "grad_norm": 0.5896333456039429, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.051, + "step": 8390 + }, + { + "epoch": 0.24017155110793423, + "grad_norm": 0.536375105381012, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0432, + "step": 8400 + }, + { + "epoch": 0.24045746962115797, + "grad_norm": 0.7597050666809082, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0459, + "step": 8410 + }, + { + "epoch": 0.2407433881343817, + "grad_norm": 0.6669795513153076, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0584, + "step": 8420 + }, + { + "epoch": 0.24102930664760544, + "grad_norm": 0.3614502251148224, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.039, + "step": 8430 + }, + { + "epoch": 0.24131522516082915, + "grad_norm": 0.5618023872375488, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0394, + "step": 8440 + }, + { + "epoch": 0.2416011436740529, + "grad_norm": 0.5897185802459717, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0502, + "step": 8450 + }, + { + "epoch": 0.24188706218727662, + "grad_norm": 0.5622876882553101, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0382, + "step": 8460 + }, + { + "epoch": 0.24217298070050036, + "grad_norm": 0.5639696717262268, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0652, + "step": 8470 + }, + { + "epoch": 0.2424588992137241, + "grad_norm": 0.5686836242675781, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0609, + "step": 8480 + }, + { + "epoch": 0.2427448177269478, + "grad_norm": 0.7248222827911377, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0512, + "step": 8490 + }, + { + "epoch": 0.24303073624017155, + "grad_norm": 0.6157225370407104, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0449, + "step": 8500 + }, + { + "epoch": 0.24331665475339528, + "grad_norm": 1.1660966873168945, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0427, + "step": 8510 + }, + { + "epoch": 0.24360257326661902, + "grad_norm": 1.1242589950561523, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0514, + "step": 8520 + }, + { + "epoch": 0.24388849177984273, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0491, + "step": 8530 + }, + { + "epoch": 0.24417441029306647, + "grad_norm": 0.41474589705467224, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0427, + "step": 8540 + }, + { + "epoch": 0.2444603288062902, + "grad_norm": 0.42195969820022583, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0486, + "step": 8550 + }, + { + "epoch": 0.24474624731951394, + "grad_norm": 0.3914433717727661, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0411, + "step": 8560 + }, + { + "epoch": 0.24503216583273768, + "grad_norm": 0.7590876817703247, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0439, + "step": 8570 + }, + { + "epoch": 0.2453180843459614, + "grad_norm": 0.4362296164035797, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0466, + "step": 8580 + }, + { + "epoch": 0.24560400285918513, + "grad_norm": 0.467949241399765, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0502, + "step": 8590 + }, + { + "epoch": 0.24588992137240887, + "grad_norm": 0.4731729328632355, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0599, + "step": 8600 + }, + { + "epoch": 0.2461758398856326, + "grad_norm": 0.491644948720932, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0524, + "step": 8610 + }, + { + "epoch": 0.2464617583988563, + "grad_norm": 0.5254928469657898, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0469, + "step": 8620 + }, + { + "epoch": 0.24674767691208005, + "grad_norm": 0.5721238255500793, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0493, + "step": 8630 + }, + { + "epoch": 0.2470335954253038, + "grad_norm": 0.5806096792221069, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0391, + "step": 8640 + }, + { + "epoch": 0.24731951393852752, + "grad_norm": 0.6683222055435181, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.24760543245175126, + "grad_norm": 0.41728726029396057, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0411, + "step": 8660 + }, + { + "epoch": 0.24789135096497497, + "grad_norm": 0.6001113653182983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0413, + "step": 8670 + }, + { + "epoch": 0.2481772694781987, + "grad_norm": 0.43813610076904297, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0389, + "step": 8680 + }, + { + "epoch": 0.24846318799142245, + "grad_norm": 1.5533791780471802, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0597, + "step": 8690 + }, + { + "epoch": 0.24874910650464618, + "grad_norm": 1.175837755203247, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 0.2490350250178699, + "grad_norm": 0.4798300862312317, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0459, + "step": 8710 + }, + { + "epoch": 0.24932094353109363, + "grad_norm": 0.7334772944450378, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0432, + "step": 8720 + }, + { + "epoch": 0.24960686204431737, + "grad_norm": 0.9633310437202454, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.05, + "step": 8730 + }, + { + "epoch": 0.2498927805575411, + "grad_norm": 0.7353480458259583, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.25017869907076484, + "grad_norm": 0.5958748459815979, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0428, + "step": 8750 + }, + { + "epoch": 0.2504646175839886, + "grad_norm": 0.8538689613342285, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0498, + "step": 8760 + }, + { + "epoch": 0.2507505360972123, + "grad_norm": 0.606607973575592, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.251036454610436, + "grad_norm": 0.3999035060405731, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0714, + "step": 8780 + }, + { + "epoch": 0.25132237312365974, + "grad_norm": 0.807314932346344, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.057, + "step": 8790 + }, + { + "epoch": 0.2516082916368835, + "grad_norm": 0.5238217115402222, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0485, + "step": 8800 + }, + { + "epoch": 0.2518942101501072, + "grad_norm": 1.6465950012207031, + "learning_rate": 1.696714953556411e-05, + "loss": 0.056, + "step": 8810 + }, + { + "epoch": 0.25218012866333095, + "grad_norm": 0.6568214297294617, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0424, + "step": 8820 + }, + { + "epoch": 0.2524660471765547, + "grad_norm": 0.4695168137550354, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0456, + "step": 8830 + }, + { + "epoch": 0.2527519656897784, + "grad_norm": 0.5652263164520264, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0527, + "step": 8840 + }, + { + "epoch": 0.25303788420300216, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0441, + "step": 8850 + }, + { + "epoch": 0.2533238027162259, + "grad_norm": 0.8288971781730652, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0513, + "step": 8860 + }, + { + "epoch": 0.2536097212294496, + "grad_norm": 0.8606051802635193, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0416, + "step": 8870 + }, + { + "epoch": 0.2538956397426733, + "grad_norm": 0.7235842347145081, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0481, + "step": 8880 + }, + { + "epoch": 0.25418155825589706, + "grad_norm": 0.9602673053741455, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0465, + "step": 8890 + }, + { + "epoch": 0.2544674767691208, + "grad_norm": 0.6431217789649963, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0489, + "step": 8900 + }, + { + "epoch": 0.25475339528234453, + "grad_norm": 0.42215701937675476, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0376, + "step": 8910 + }, + { + "epoch": 0.25503931379556827, + "grad_norm": 0.5899976491928101, + "learning_rate": 1.688644181174108e-05, + "loss": 0.048, + "step": 8920 + }, + { + "epoch": 0.255325232308792, + "grad_norm": 0.9504411816596985, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.047, + "step": 8930 + }, + { + "epoch": 0.25561115082201574, + "grad_norm": 0.5808438062667847, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0535, + "step": 8940 + }, + { + "epoch": 0.2558970693352395, + "grad_norm": 0.3811270594596863, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0418, + "step": 8950 + }, + { + "epoch": 0.25618298784846316, + "grad_norm": 1.0257363319396973, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0548, + "step": 8960 + }, + { + "epoch": 0.2564689063616869, + "grad_norm": 0.7294469475746155, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0569, + "step": 8970 + }, + { + "epoch": 0.25675482487491064, + "grad_norm": 0.4967000484466553, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0488, + "step": 8980 + }, + { + "epoch": 0.2570407433881344, + "grad_norm": 0.9160422086715698, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0471, + "step": 8990 + }, + { + "epoch": 0.2573266619013581, + "grad_norm": 0.5125435590744019, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.25761258041458185, + "grad_norm": 0.5617201328277588, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0597, + "step": 9010 + }, + { + "epoch": 0.2578984989278056, + "grad_norm": 0.7771851420402527, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0485, + "step": 9020 + }, + { + "epoch": 0.2581844174410293, + "grad_norm": 0.8434289693832397, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0429, + "step": 9030 + }, + { + "epoch": 0.25847033595425306, + "grad_norm": 0.513541042804718, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0488, + "step": 9040 + }, + { + "epoch": 0.25875625446747674, + "grad_norm": 1.0142096281051636, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0517, + "step": 9050 + }, + { + "epoch": 0.2590421729807005, + "grad_norm": 0.6343669295310974, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.049, + "step": 9060 + }, + { + "epoch": 0.2593280914939242, + "grad_norm": 0.33996936678886414, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.037, + "step": 9070 + }, + { + "epoch": 0.25961401000714796, + "grad_norm": 0.5964446663856506, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 0.2598999285203717, + "grad_norm": 0.4989728629589081, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0463, + "step": 9090 + }, + { + "epoch": 0.26018584703359543, + "grad_norm": 0.7735986113548279, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0576, + "step": 9100 + }, + { + "epoch": 0.26047176554681917, + "grad_norm": 1.2520418167114258, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0577, + "step": 9110 + }, + { + "epoch": 0.2607576840600429, + "grad_norm": 0.45247936248779297, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0458, + "step": 9120 + }, + { + "epoch": 0.26104360257326664, + "grad_norm": 0.8944823145866394, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0498, + "step": 9130 + }, + { + "epoch": 0.2613295210864903, + "grad_norm": 0.8308315277099609, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0545, + "step": 9140 + }, + { + "epoch": 0.26161543959971406, + "grad_norm": 0.6838778853416443, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 0.2619013581129378, + "grad_norm": 1.5998408794403076, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0591, + "step": 9160 + }, + { + "epoch": 0.26218727662616154, + "grad_norm": 0.8548596501350403, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.04, + "step": 9170 + }, + { + "epoch": 0.2624731951393853, + "grad_norm": 0.5784913897514343, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0464, + "step": 9180 + }, + { + "epoch": 0.262759113652609, + "grad_norm": 1.490502953529358, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0672, + "step": 9190 + }, + { + "epoch": 0.26304503216583275, + "grad_norm": 0.8950793743133545, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0532, + "step": 9200 + }, + { + "epoch": 0.2633309506790565, + "grad_norm": 0.5513611435890198, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 0.2636168691922802, + "grad_norm": 1.0512864589691162, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0487, + "step": 9220 + }, + { + "epoch": 0.2639027877055039, + "grad_norm": 0.48180028796195984, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0543, + "step": 9230 + }, + { + "epoch": 0.26418870621872764, + "grad_norm": 0.5451590418815613, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0553, + "step": 9240 + }, + { + "epoch": 0.2644746247319514, + "grad_norm": 0.6986148953437805, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0523, + "step": 9250 + }, + { + "epoch": 0.2647605432451751, + "grad_norm": 0.5977929830551147, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0446, + "step": 9260 + }, + { + "epoch": 0.26504646175839885, + "grad_norm": 0.6042361855506897, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0716, + "step": 9270 + }, + { + "epoch": 0.2653323802716226, + "grad_norm": 0.473418265581131, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0378, + "step": 9280 + }, + { + "epoch": 0.26561829878484633, + "grad_norm": 0.9332809448242188, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0484, + "step": 9290 + }, + { + "epoch": 0.26590421729807007, + "grad_norm": 0.5209246277809143, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0439, + "step": 9300 + }, + { + "epoch": 0.2661901358112938, + "grad_norm": 0.5742560625076294, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0468, + "step": 9310 + }, + { + "epoch": 0.2664760543245175, + "grad_norm": 0.585503876209259, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0507, + "step": 9320 + }, + { + "epoch": 0.2667619728377412, + "grad_norm": 0.5254957675933838, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0436, + "step": 9330 + }, + { + "epoch": 0.26704789135096496, + "grad_norm": 0.48314452171325684, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0417, + "step": 9340 + }, + { + "epoch": 0.2673338098641887, + "grad_norm": 0.630020022392273, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0425, + "step": 9350 + }, + { + "epoch": 0.26761972837741244, + "grad_norm": 0.3545299470424652, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0338, + "step": 9360 + }, + { + "epoch": 0.2679056468906362, + "grad_norm": 0.6934211850166321, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0445, + "step": 9370 + }, + { + "epoch": 0.2681915654038599, + "grad_norm": 0.6544952392578125, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0451, + "step": 9380 + }, + { + "epoch": 0.26847748391708365, + "grad_norm": 0.4581946134567261, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0422, + "step": 9390 + }, + { + "epoch": 0.2687634024303074, + "grad_norm": 0.6338506937026978, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0576, + "step": 9400 + }, + { + "epoch": 0.26904932094353107, + "grad_norm": 0.8165014386177063, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0474, + "step": 9410 + }, + { + "epoch": 0.2693352394567548, + "grad_norm": 0.793222188949585, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0546, + "step": 9420 + }, + { + "epoch": 0.26962115796997854, + "grad_norm": 0.3669852316379547, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0461, + "step": 9430 + }, + { + "epoch": 0.2699070764832023, + "grad_norm": 0.7339810729026794, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0433, + "step": 9440 + }, + { + "epoch": 0.270192994996426, + "grad_norm": 0.4948982298374176, + "learning_rate": 1.648606940465527e-05, + "loss": 0.048, + "step": 9450 + }, + { + "epoch": 0.27047891350964975, + "grad_norm": 0.4681016206741333, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0437, + "step": 9460 + }, + { + "epoch": 0.2707648320228735, + "grad_norm": 0.5091472864151001, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0576, + "step": 9470 + }, + { + "epoch": 0.27105075053609723, + "grad_norm": 0.5683515071868896, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0503, + "step": 9480 + }, + { + "epoch": 0.27133666904932097, + "grad_norm": 0.626844048500061, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0495, + "step": 9490 + }, + { + "epoch": 0.27162258756254465, + "grad_norm": 0.6757943034172058, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0495, + "step": 9500 + }, + { + "epoch": 0.2719085060757684, + "grad_norm": 0.7049196362495422, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0579, + "step": 9510 + }, + { + "epoch": 0.2721944245889921, + "grad_norm": 0.6469181776046753, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.051, + "step": 9520 + }, + { + "epoch": 0.27248034310221586, + "grad_norm": 0.5414942502975464, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0433, + "step": 9530 + }, + { + "epoch": 0.2727662616154396, + "grad_norm": 0.5642798542976379, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0495, + "step": 9540 + }, + { + "epoch": 0.27305218012866334, + "grad_norm": 1.0527595281600952, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0445, + "step": 9550 + }, + { + "epoch": 0.2733380986418871, + "grad_norm": 0.8501784801483154, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0627, + "step": 9560 + }, + { + "epoch": 0.2736240171551108, + "grad_norm": 0.7892033457756042, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 0.27390993566833455, + "grad_norm": 0.3588624596595764, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0512, + "step": 9580 + }, + { + "epoch": 0.27419585418155823, + "grad_norm": 0.7474772930145264, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.6217718124389648, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0435, + "step": 9600 + }, + { + "epoch": 0.2747676912080057, + "grad_norm": 0.7711623907089233, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.054, + "step": 9610 + }, + { + "epoch": 0.27505360972122944, + "grad_norm": 0.8171371221542358, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0371, + "step": 9620 + }, + { + "epoch": 0.2753395282344532, + "grad_norm": 0.8668338060379028, + "learning_rate": 1.634591312387623e-05, + "loss": 0.055, + "step": 9630 + }, + { + "epoch": 0.2756254467476769, + "grad_norm": 0.5683940052986145, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0478, + "step": 9640 + }, + { + "epoch": 0.27591136526090065, + "grad_norm": 0.44098007678985596, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0531, + "step": 9650 + }, + { + "epoch": 0.2761972837741244, + "grad_norm": 0.8305087685585022, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0462, + "step": 9660 + }, + { + "epoch": 0.27648320228734813, + "grad_norm": 0.9088799953460693, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0489, + "step": 9670 + }, + { + "epoch": 0.2767691208005718, + "grad_norm": 0.5590132474899292, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0486, + "step": 9680 + }, + { + "epoch": 0.27705503931379555, + "grad_norm": 0.776713490486145, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0443, + "step": 9690 + }, + { + "epoch": 0.2773409578270193, + "grad_norm": 0.6107578873634338, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0461, + "step": 9700 + }, + { + "epoch": 0.277626876340243, + "grad_norm": 0.4635901153087616, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0397, + "step": 9710 + }, + { + "epoch": 0.27791279485346676, + "grad_norm": 0.4220955967903137, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0463, + "step": 9720 + }, + { + "epoch": 0.2781987133666905, + "grad_norm": 0.4947739243507385, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0397, + "step": 9730 + }, + { + "epoch": 0.27848463187991424, + "grad_norm": 0.5589033961296082, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0426, + "step": 9740 + }, + { + "epoch": 0.278770550393138, + "grad_norm": 0.4904254972934723, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0458, + "step": 9750 + }, + { + "epoch": 0.2790564689063617, + "grad_norm": 0.34956127405166626, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.2793423874195854, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0424, + "step": 9770 + }, + { + "epoch": 0.27962830593280913, + "grad_norm": 0.48727869987487793, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0451, + "step": 9780 + }, + { + "epoch": 0.27991422444603287, + "grad_norm": 0.7314761281013489, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0523, + "step": 9790 + }, + { + "epoch": 0.2802001429592566, + "grad_norm": 0.5017405152320862, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0423, + "step": 9800 + }, + { + "epoch": 0.28048606147248034, + "grad_norm": 0.8375383615493774, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0435, + "step": 9810 + }, + { + "epoch": 0.2807719799857041, + "grad_norm": 0.8702818155288696, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0487, + "step": 9820 + }, + { + "epoch": 0.2810578984989278, + "grad_norm": 0.4649866223335266, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0483, + "step": 9830 + }, + { + "epoch": 0.28134381701215155, + "grad_norm": 0.7464607357978821, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0463, + "step": 9840 + }, + { + "epoch": 0.2816297355253753, + "grad_norm": 0.48055607080459595, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0418, + "step": 9850 + }, + { + "epoch": 0.281915654038599, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0433, + "step": 9860 + }, + { + "epoch": 0.2822015725518227, + "grad_norm": 0.8859265446662903, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0605, + "step": 9870 + }, + { + "epoch": 0.28248749106504645, + "grad_norm": 0.8236640691757202, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0441, + "step": 9880 + }, + { + "epoch": 0.2827734095782702, + "grad_norm": 0.6617199778556824, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0515, + "step": 9890 + }, + { + "epoch": 0.2830593280914939, + "grad_norm": 0.8017821907997131, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0503, + "step": 9900 + }, + { + "epoch": 0.28334524660471766, + "grad_norm": 1.070827603340149, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0485, + "step": 9910 + }, + { + "epoch": 0.2836311651179414, + "grad_norm": 1.021888256072998, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0479, + "step": 9920 + }, + { + "epoch": 0.28391708363116513, + "grad_norm": 0.34402501583099365, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0391, + "step": 9930 + }, + { + "epoch": 0.28420300214438887, + "grad_norm": 0.58541339635849, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0461, + "step": 9940 + }, + { + "epoch": 0.28448892065761255, + "grad_norm": 0.8062207102775574, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0553, + "step": 9950 + }, + { + "epoch": 0.2847748391708363, + "grad_norm": 0.6435661315917969, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0536, + "step": 9960 + }, + { + "epoch": 0.28506075768406003, + "grad_norm": 0.5670832395553589, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0405, + "step": 9970 + }, + { + "epoch": 0.28534667619728377, + "grad_norm": 0.45282548666000366, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0458, + "step": 9980 + }, + { + "epoch": 0.2856325947105075, + "grad_norm": 0.42272916436195374, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0392, + "step": 9990 + }, + { + "epoch": 0.28591851322373124, + "grad_norm": 0.5791928768157959, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0453, + "step": 10000 + }, + { + "epoch": 0.286204431736955, + "grad_norm": 0.9841408729553223, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.052, + "step": 10010 + }, + { + "epoch": 0.2864903502501787, + "grad_norm": 0.8658338785171509, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0461, + "step": 10020 + }, + { + "epoch": 0.28677626876340245, + "grad_norm": 0.624788224697113, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0416, + "step": 10030 + }, + { + "epoch": 0.28706218727662614, + "grad_norm": 0.6108028888702393, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0548, + "step": 10040 + }, + { + "epoch": 0.2873481057898499, + "grad_norm": 0.7907708883285522, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0406, + "step": 10050 + }, + { + "epoch": 0.2876340243030736, + "grad_norm": 0.7695413827896118, + "learning_rate": 1.60029690609047e-05, + "loss": 0.061, + "step": 10060 + }, + { + "epoch": 0.28791994281629735, + "grad_norm": 0.4407683312892914, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0483, + "step": 10070 + }, + { + "epoch": 0.2882058613295211, + "grad_norm": 0.6242743730545044, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.039, + "step": 10080 + }, + { + "epoch": 0.2884917798427448, + "grad_norm": 0.8752113580703735, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0433, + "step": 10090 + }, + { + "epoch": 0.28877769835596856, + "grad_norm": 0.8834511041641235, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0348, + "step": 10100 + }, + { + "epoch": 0.2890636168691923, + "grad_norm": 1.0036063194274902, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0593, + "step": 10110 + }, + { + "epoch": 0.28934953538241603, + "grad_norm": 0.5511205196380615, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0459, + "step": 10120 + }, + { + "epoch": 0.2896354538956397, + "grad_norm": 0.7717337012290955, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0381, + "step": 10130 + }, + { + "epoch": 0.28992137240886345, + "grad_norm": 1.123363971710205, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0462, + "step": 10140 + }, + { + "epoch": 0.2902072909220872, + "grad_norm": 0.6212007403373718, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0446, + "step": 10150 + }, + { + "epoch": 0.29049320943531093, + "grad_norm": 0.5547964572906494, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0362, + "step": 10160 + }, + { + "epoch": 0.29077912794853467, + "grad_norm": 0.593225359916687, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0462, + "step": 10170 + }, + { + "epoch": 0.2910650464617584, + "grad_norm": 0.5569560527801514, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0508, + "step": 10180 + }, + { + "epoch": 0.29135096497498214, + "grad_norm": 0.5464656949043274, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0399, + "step": 10190 + }, + { + "epoch": 0.2916368834882059, + "grad_norm": 1.2456778287887573, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0494, + "step": 10200 + }, + { + "epoch": 0.2919228020014296, + "grad_norm": 0.7862445712089539, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0551, + "step": 10210 + }, + { + "epoch": 0.2922087205146533, + "grad_norm": 0.745941698551178, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0469, + "step": 10220 + }, + { + "epoch": 0.29249463902787703, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0492, + "step": 10230 + }, + { + "epoch": 0.29278055754110077, + "grad_norm": 0.659205973148346, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0453, + "step": 10240 + }, + { + "epoch": 0.2930664760543245, + "grad_norm": 0.6925905346870422, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0463, + "step": 10250 + }, + { + "epoch": 0.29335239456754825, + "grad_norm": 0.479115754365921, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0395, + "step": 10260 + }, + { + "epoch": 0.293638313080772, + "grad_norm": 0.5085121393203735, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0504, + "step": 10270 + }, + { + "epoch": 0.2939242315939957, + "grad_norm": 0.46833914518356323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0411, + "step": 10280 + }, + { + "epoch": 0.29421015010721946, + "grad_norm": 0.4534672796726227, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0491, + "step": 10290 + }, + { + "epoch": 0.2944960686204432, + "grad_norm": 0.5704737305641174, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0391, + "step": 10300 + }, + { + "epoch": 0.2947819871336669, + "grad_norm": 1.0342676639556885, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0681, + "step": 10310 + }, + { + "epoch": 0.2950679056468906, + "grad_norm": 0.5002169013023376, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0429, + "step": 10320 + }, + { + "epoch": 0.29535382416011435, + "grad_norm": 0.5565863847732544, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0575, + "step": 10330 + }, + { + "epoch": 0.2956397426733381, + "grad_norm": 0.7826551198959351, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0448, + "step": 10340 + }, + { + "epoch": 0.29592566118656183, + "grad_norm": 0.7019012570381165, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0436, + "step": 10350 + }, + { + "epoch": 0.29621157969978557, + "grad_norm": 0.8324534893035889, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0503, + "step": 10360 + }, + { + "epoch": 0.2964974982130093, + "grad_norm": 0.7064073085784912, + "learning_rate": 1.574895332125391e-05, + "loss": 0.041, + "step": 10370 + }, + { + "epoch": 0.29678341672623304, + "grad_norm": 0.5634047389030457, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0474, + "step": 10380 + }, + { + "epoch": 0.2970693352394568, + "grad_norm": 0.8504926562309265, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0502, + "step": 10390 + }, + { + "epoch": 0.29735525375268046, + "grad_norm": 0.508313775062561, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0368, + "step": 10400 + }, + { + "epoch": 0.2976411722659042, + "grad_norm": 0.5851112008094788, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0472, + "step": 10410 + }, + { + "epoch": 0.29792709077912793, + "grad_norm": 0.5689557790756226, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0479, + "step": 10420 + }, + { + "epoch": 0.29821300929235167, + "grad_norm": 0.5026743412017822, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0406, + "step": 10430 + }, + { + "epoch": 0.2984989278055754, + "grad_norm": 0.5662751197814941, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0441, + "step": 10440 + }, + { + "epoch": 0.29878484631879915, + "grad_norm": 0.899709939956665, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.2990707648320229, + "grad_norm": 0.4681940972805023, + "learning_rate": 1.567419089313346e-05, + "loss": 0.054, + "step": 10460 + }, + { + "epoch": 0.2993566833452466, + "grad_norm": 0.39646071195602417, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0375, + "step": 10470 + }, + { + "epoch": 0.29964260185847036, + "grad_norm": 1.204815149307251, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0487, + "step": 10480 + }, + { + "epoch": 0.29992852037169404, + "grad_norm": 0.4507630467414856, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0516, + "step": 10490 + }, + { + "epoch": 0.3002144388849178, + "grad_norm": 0.9783321022987366, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0642, + "step": 10500 + }, + { + "epoch": 0.3005003573981415, + "grad_norm": 0.5406969785690308, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0447, + "step": 10510 + }, + { + "epoch": 0.30078627591136525, + "grad_norm": 0.44153860211372375, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0449, + "step": 10520 + }, + { + "epoch": 0.301072194424589, + "grad_norm": 0.5723687410354614, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0548, + "step": 10530 + }, + { + "epoch": 0.3013581129378127, + "grad_norm": 0.4453120529651642, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0434, + "step": 10540 + }, + { + "epoch": 0.30164403145103647, + "grad_norm": 0.34224697947502136, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0385, + "step": 10550 + }, + { + "epoch": 0.3019299499642602, + "grad_norm": 0.6389157176017761, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0569, + "step": 10560 + }, + { + "epoch": 0.30221586847748394, + "grad_norm": 0.5845953822135925, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0467, + "step": 10570 + }, + { + "epoch": 0.3025017869907076, + "grad_norm": 0.6581900119781494, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0422, + "step": 10580 + }, + { + "epoch": 0.30278770550393136, + "grad_norm": 0.4964161813259125, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0428, + "step": 10590 + }, + { + "epoch": 0.3030736240171551, + "grad_norm": 0.635380208492279, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0442, + "step": 10600 + }, + { + "epoch": 0.30335954253037883, + "grad_norm": 0.9795969128608704, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0517, + "step": 10610 + }, + { + "epoch": 0.30364546104360257, + "grad_norm": 0.9987231492996216, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0514, + "step": 10620 + }, + { + "epoch": 0.3039313795568263, + "grad_norm": 0.6384946703910828, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0471, + "step": 10630 + }, + { + "epoch": 0.30421729807005005, + "grad_norm": 0.49352115392684937, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0351, + "step": 10640 + }, + { + "epoch": 0.3045032165832738, + "grad_norm": 0.45028480887413025, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0438, + "step": 10650 + }, + { + "epoch": 0.3047891350964975, + "grad_norm": 0.5717794895172119, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0491, + "step": 10660 + }, + { + "epoch": 0.3050750536097212, + "grad_norm": 0.5436326265335083, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0407, + "step": 10670 + }, + { + "epoch": 0.30536097212294494, + "grad_norm": 0.7777692675590515, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0436, + "step": 10680 + }, + { + "epoch": 0.3056468906361687, + "grad_norm": 0.6597929000854492, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0407, + "step": 10690 + }, + { + "epoch": 0.3059328091493924, + "grad_norm": 0.6059311032295227, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0481, + "step": 10700 + }, + { + "epoch": 0.30621872766261615, + "grad_norm": 0.5530681014060974, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0418, + "step": 10710 + }, + { + "epoch": 0.3065046461758399, + "grad_norm": 0.5778716802597046, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0429, + "step": 10720 + }, + { + "epoch": 0.3067905646890636, + "grad_norm": 0.4573792517185211, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0586, + "step": 10730 + }, + { + "epoch": 0.30707648320228736, + "grad_norm": 0.8193615078926086, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0474, + "step": 10740 + }, + { + "epoch": 0.3073624017155111, + "grad_norm": 0.9410123229026794, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.3076483202287348, + "grad_norm": 0.8244432806968689, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0462, + "step": 10760 + }, + { + "epoch": 0.3079342387419585, + "grad_norm": 0.644899845123291, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0479, + "step": 10770 + }, + { + "epoch": 0.30822015725518226, + "grad_norm": 0.28044867515563965, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.04, + "step": 10780 + }, + { + "epoch": 0.308506075768406, + "grad_norm": 0.6538394093513489, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0406, + "step": 10790 + }, + { + "epoch": 0.30879199428162973, + "grad_norm": 0.9572822451591492, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0505, + "step": 10800 + }, + { + "epoch": 0.30907791279485347, + "grad_norm": 0.539826512336731, + "learning_rate": 1.537928999540189e-05, + "loss": 0.05, + "step": 10810 + }, + { + "epoch": 0.3093638313080772, + "grad_norm": 0.801988959312439, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0454, + "step": 10820 + }, + { + "epoch": 0.30964974982130095, + "grad_norm": 0.57478928565979, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.039, + "step": 10830 + }, + { + "epoch": 0.3099356683345247, + "grad_norm": 0.6313017010688782, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0384, + "step": 10840 + }, + { + "epoch": 0.31022158684774837, + "grad_norm": 0.507997989654541, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0365, + "step": 10850 + }, + { + "epoch": 0.3105075053609721, + "grad_norm": 0.5152313709259033, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0487, + "step": 10860 + }, + { + "epoch": 0.31079342387419584, + "grad_norm": 0.6123478412628174, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0405, + "step": 10870 + }, + { + "epoch": 0.3110793423874196, + "grad_norm": 1.079551100730896, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0443, + "step": 10880 + }, + { + "epoch": 0.3113652609006433, + "grad_norm": 0.39866960048675537, + "learning_rate": 1.531098472380285e-05, + "loss": 0.04, + "step": 10890 + }, + { + "epoch": 0.31165117941386705, + "grad_norm": 0.3715427815914154, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0387, + "step": 10900 + }, + { + "epoch": 0.3119370979270908, + "grad_norm": 0.7201068997383118, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.054, + "step": 10910 + }, + { + "epoch": 0.3122230164403145, + "grad_norm": 0.9512631893157959, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0383, + "step": 10920 + }, + { + "epoch": 0.31250893495353826, + "grad_norm": 0.5948206186294556, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0472, + "step": 10930 + }, + { + "epoch": 0.31279485346676195, + "grad_norm": 0.7174249291419983, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0437, + "step": 10940 + }, + { + "epoch": 0.3130807719799857, + "grad_norm": 0.6190982460975647, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0383, + "step": 10950 + }, + { + "epoch": 0.3133666904932094, + "grad_norm": 0.7733815312385559, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0327, + "step": 10960 + }, + { + "epoch": 0.31365260900643316, + "grad_norm": 1.2995271682739258, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0427, + "step": 10970 + }, + { + "epoch": 0.3139385275196569, + "grad_norm": 1.1102336645126343, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.04, + "step": 10980 + }, + { + "epoch": 0.31422444603288063, + "grad_norm": 0.7618277668952942, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0488, + "step": 10990 + }, + { + "epoch": 0.31451036454610437, + "grad_norm": 0.5355142951011658, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0436, + "step": 11000 + }, + { + "epoch": 0.3147962830593281, + "grad_norm": 1.3410072326660156, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0463, + "step": 11010 + }, + { + "epoch": 0.31508220157255185, + "grad_norm": 0.7810450196266174, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0493, + "step": 11020 + }, + { + "epoch": 0.3153681200857755, + "grad_norm": 0.6452206373214722, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0354, + "step": 11030 + }, + { + "epoch": 0.31565403859899926, + "grad_norm": 1.037593126296997, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0418, + "step": 11040 + }, + { + "epoch": 0.315939957112223, + "grad_norm": 0.7032834887504578, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0431, + "step": 11050 + }, + { + "epoch": 0.31622587562544674, + "grad_norm": 0.5168939232826233, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0472, + "step": 11060 + }, + { + "epoch": 0.3165117941386705, + "grad_norm": 0.5239925384521484, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0438, + "step": 11070 + }, + { + "epoch": 0.3167977126518942, + "grad_norm": 0.8209654688835144, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0506, + "step": 11080 + }, + { + "epoch": 0.31708363116511795, + "grad_norm": 0.5318232178688049, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0516, + "step": 11090 + }, + { + "epoch": 0.3173695496783417, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0482, + "step": 11100 + }, + { + "epoch": 0.3176554681915654, + "grad_norm": 0.6691215634346008, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.046, + "step": 11110 + }, + { + "epoch": 0.3179413867047891, + "grad_norm": 0.4862753450870514, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 0.31822730521801285, + "grad_norm": 0.4640316963195801, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0433, + "step": 11130 + }, + { + "epoch": 0.3185132237312366, + "grad_norm": 0.7841521501541138, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0445, + "step": 11140 + }, + { + "epoch": 0.3187991422444603, + "grad_norm": 0.6809426546096802, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0518, + "step": 11150 + }, + { + "epoch": 0.31908506075768406, + "grad_norm": 0.6195946931838989, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0569, + "step": 11160 + }, + { + "epoch": 0.3193709792709078, + "grad_norm": 0.7289860248565674, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0487, + "step": 11170 + }, + { + "epoch": 0.31965689778413153, + "grad_norm": 0.5575736165046692, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0409, + "step": 11180 + }, + { + "epoch": 0.31994281629735527, + "grad_norm": 0.8619267344474792, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0424, + "step": 11190 + }, + { + "epoch": 0.320228734810579, + "grad_norm": 0.740242063999176, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0474, + "step": 11200 + }, + { + "epoch": 0.3205146533238027, + "grad_norm": 0.4169894754886627, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0395, + "step": 11210 + }, + { + "epoch": 0.3208005718370264, + "grad_norm": 0.5773794651031494, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0414, + "step": 11220 + }, + { + "epoch": 0.32108649035025016, + "grad_norm": 0.4941500723361969, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0484, + "step": 11230 + }, + { + "epoch": 0.3213724088634739, + "grad_norm": 0.7985579371452332, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.051, + "step": 11240 + }, + { + "epoch": 0.32165832737669764, + "grad_norm": 0.5262066721916199, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0434, + "step": 11250 + }, + { + "epoch": 0.3219442458899214, + "grad_norm": 0.4074312150478363, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0428, + "step": 11260 + }, + { + "epoch": 0.3222301644031451, + "grad_norm": 1.0757715702056885, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0468, + "step": 11270 + }, + { + "epoch": 0.32251608291636885, + "grad_norm": 0.7281575202941895, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0386, + "step": 11280 + }, + { + "epoch": 0.3228020014295926, + "grad_norm": 0.35078516602516174, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0413, + "step": 11290 + }, + { + "epoch": 0.32308791994281627, + "grad_norm": 0.5642452836036682, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0435, + "step": 11300 + }, + { + "epoch": 0.32337383845604, + "grad_norm": 0.5326974987983704, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0459, + "step": 11310 + }, + { + "epoch": 0.32365975696926375, + "grad_norm": 0.6212049126625061, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0451, + "step": 11320 + }, + { + "epoch": 0.3239456754824875, + "grad_norm": 0.4887222349643707, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0445, + "step": 11330 + }, + { + "epoch": 0.3242315939957112, + "grad_norm": 0.6692403554916382, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0423, + "step": 11340 + }, + { + "epoch": 0.32451751250893496, + "grad_norm": 0.7166061997413635, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0445, + "step": 11350 + }, + { + "epoch": 0.3248034310221587, + "grad_norm": 0.5342463850975037, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0394, + "step": 11360 + }, + { + "epoch": 0.32508934953538243, + "grad_norm": 1.0617904663085938, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0401, + "step": 11370 + }, + { + "epoch": 0.32537526804860617, + "grad_norm": 0.9869458675384521, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0508, + "step": 11380 + }, + { + "epoch": 0.32566118656182985, + "grad_norm": 0.32021698355674744, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0346, + "step": 11390 + }, + { + "epoch": 0.3259471050750536, + "grad_norm": 0.6566154360771179, + "learning_rate": 1.486814531655139e-05, + "loss": 0.046, + "step": 11400 + }, + { + "epoch": 0.3262330235882773, + "grad_norm": 0.6716777086257935, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.045, + "step": 11410 + }, + { + "epoch": 0.32651894210150106, + "grad_norm": 0.7489042282104492, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0443, + "step": 11420 + }, + { + "epoch": 0.3268048606147248, + "grad_norm": 0.6040313243865967, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0418, + "step": 11430 + }, + { + "epoch": 0.32709077912794854, + "grad_norm": 0.4891999363899231, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0342, + "step": 11440 + }, + { + "epoch": 0.3273766976411723, + "grad_norm": 0.4264339506626129, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0414, + "step": 11450 + }, + { + "epoch": 0.327662616154396, + "grad_norm": 0.5535606741905212, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0362, + "step": 11460 + }, + { + "epoch": 0.32794853466761975, + "grad_norm": 0.566705048084259, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0472, + "step": 11470 + }, + { + "epoch": 0.32823445318084343, + "grad_norm": 0.8539089560508728, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0478, + "step": 11480 + }, + { + "epoch": 0.32852037169406717, + "grad_norm": 0.3981179893016815, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0429, + "step": 11490 + }, + { + "epoch": 0.3288062902072909, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0487, + "step": 11500 + }, + { + "epoch": 0.32909220872051465, + "grad_norm": 0.45551198720932007, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0384, + "step": 11510 + }, + { + "epoch": 0.3293781272337384, + "grad_norm": 0.6321517825126648, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0541, + "step": 11520 + }, + { + "epoch": 0.3296640457469621, + "grad_norm": 0.7971932888031006, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0445, + "step": 11530 + }, + { + "epoch": 0.32994996426018586, + "grad_norm": 0.5022657513618469, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0414, + "step": 11540 + }, + { + "epoch": 0.3302358827734096, + "grad_norm": 0.7302954196929932, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.044, + "step": 11550 + }, + { + "epoch": 0.33052180128663333, + "grad_norm": 0.5123834013938904, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0451, + "step": 11560 + }, + { + "epoch": 0.330807719799857, + "grad_norm": 0.5261625647544861, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0416, + "step": 11570 + }, + { + "epoch": 0.33109363831308075, + "grad_norm": 0.5782840251922607, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0419, + "step": 11580 + }, + { + "epoch": 0.3313795568263045, + "grad_norm": 0.9754800796508789, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0403, + "step": 11590 + }, + { + "epoch": 0.3316654753395282, + "grad_norm": 0.48157551884651184, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0459, + "step": 11600 + }, + { + "epoch": 0.33195139385275196, + "grad_norm": 0.4394964277744293, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0461, + "step": 11610 + }, + { + "epoch": 0.3322373123659757, + "grad_norm": 1.220790147781372, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0448, + "step": 11620 + }, + { + "epoch": 0.33252323087919944, + "grad_norm": 0.6908231973648071, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0431, + "step": 11630 + }, + { + "epoch": 0.3328091493924232, + "grad_norm": 0.45382779836654663, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0379, + "step": 11640 + }, + { + "epoch": 0.3330950679056469, + "grad_norm": 0.5963619947433472, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0465, + "step": 11650 + }, + { + "epoch": 0.3333809864188706, + "grad_norm": 0.676210880279541, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0411, + "step": 11660 + }, + { + "epoch": 0.33366690493209433, + "grad_norm": 0.893473744392395, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0443, + "step": 11670 + }, + { + "epoch": 0.33395282344531807, + "grad_norm": 0.30655553936958313, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.04, + "step": 11680 + }, + { + "epoch": 0.3342387419585418, + "grad_norm": 0.899615466594696, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0462, + "step": 11690 + }, + { + "epoch": 0.33452466047176554, + "grad_norm": 0.5037568807601929, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0394, + "step": 11700 + }, + { + "epoch": 0.3348105789849893, + "grad_norm": 0.573716402053833, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0426, + "step": 11710 + }, + { + "epoch": 0.335096497498213, + "grad_norm": 0.4985221326351166, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0422, + "step": 11720 + }, + { + "epoch": 0.33538241601143676, + "grad_norm": 0.8864797353744507, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0504, + "step": 11730 + }, + { + "epoch": 0.3356683345246605, + "grad_norm": 0.49209004640579224, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0409, + "step": 11740 + }, + { + "epoch": 0.3359542530378842, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0468, + "step": 11750 + }, + { + "epoch": 0.3362401715511079, + "grad_norm": 0.7552497386932373, + "learning_rate": 1.454836451908656e-05, + "loss": 0.041, + "step": 11760 + }, + { + "epoch": 0.33652609006433165, + "grad_norm": 0.5737242102622986, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0503, + "step": 11770 + }, + { + "epoch": 0.3368120085775554, + "grad_norm": 0.46150341629981995, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0399, + "step": 11780 + }, + { + "epoch": 0.3370979270907791, + "grad_norm": 0.55389803647995, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0442, + "step": 11790 + }, + { + "epoch": 0.33738384560400286, + "grad_norm": 0.7647727727890015, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0472, + "step": 11800 + }, + { + "epoch": 0.3376697641172266, + "grad_norm": 0.8755397200584412, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0444, + "step": 11810 + }, + { + "epoch": 0.33795568263045034, + "grad_norm": 0.9257917404174805, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0416, + "step": 11820 + }, + { + "epoch": 0.3382416011436741, + "grad_norm": 0.4048840403556824, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0418, + "step": 11830 + }, + { + "epoch": 0.33852751965689776, + "grad_norm": 0.584200382232666, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0436, + "step": 11840 + }, + { + "epoch": 0.3388134381701215, + "grad_norm": 0.7565616369247437, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0407, + "step": 11850 + }, + { + "epoch": 0.33909935668334523, + "grad_norm": 0.8025793433189392, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0424, + "step": 11860 + }, + { + "epoch": 0.33938527519656897, + "grad_norm": 0.3123756945133209, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.044, + "step": 11870 + }, + { + "epoch": 0.3396711937097927, + "grad_norm": 0.8047941327095032, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0471, + "step": 11880 + }, + { + "epoch": 0.33995711222301644, + "grad_norm": 0.8675779104232788, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0443, + "step": 11890 + }, + { + "epoch": 0.3402430307362402, + "grad_norm": 0.47229406237602234, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0416, + "step": 11900 + }, + { + "epoch": 0.3405289492494639, + "grad_norm": 0.3775595426559448, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0512, + "step": 11910 + }, + { + "epoch": 0.34081486776268766, + "grad_norm": 0.6179372668266296, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0395, + "step": 11920 + }, + { + "epoch": 0.34110078627591134, + "grad_norm": 0.47618359327316284, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0407, + "step": 11930 + }, + { + "epoch": 0.3413867047891351, + "grad_norm": 0.5495609641075134, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.041, + "step": 11940 + }, + { + "epoch": 0.3416726233023588, + "grad_norm": 0.7276089191436768, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0445, + "step": 11950 + }, + { + "epoch": 0.34195854181558255, + "grad_norm": 0.9464111328125, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0471, + "step": 11960 + }, + { + "epoch": 0.3422444603288063, + "grad_norm": 0.8340250253677368, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0488, + "step": 11970 + }, + { + "epoch": 0.34253037884203, + "grad_norm": 0.6392719149589539, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0407, + "step": 11980 + }, + { + "epoch": 0.34281629735525376, + "grad_norm": 0.7563493251800537, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0388, + "step": 11990 + }, + { + "epoch": 0.3431022158684775, + "grad_norm": 0.7145271301269531, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.042, + "step": 12000 + }, + { + "epoch": 0.34338813438170124, + "grad_norm": 0.6522033214569092, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0507, + "step": 12010 + }, + { + "epoch": 0.3436740528949249, + "grad_norm": 0.4634755849838257, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0388, + "step": 12020 + }, + { + "epoch": 0.34395997140814866, + "grad_norm": 0.6681762337684631, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0498, + "step": 12030 + }, + { + "epoch": 0.3442458899213724, + "grad_norm": 0.5068351626396179, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0484, + "step": 12040 + }, + { + "epoch": 0.34453180843459613, + "grad_norm": 0.5424943566322327, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0406, + "step": 12050 + }, + { + "epoch": 0.34481772694781987, + "grad_norm": 0.674436628818512, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.04, + "step": 12060 + }, + { + "epoch": 0.3451036454610436, + "grad_norm": 0.8140727281570435, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0417, + "step": 12070 + }, + { + "epoch": 0.34538956397426734, + "grad_norm": 0.6394575238227844, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0413, + "step": 12080 + }, + { + "epoch": 0.3456754824874911, + "grad_norm": 0.5134334564208984, + "learning_rate": 1.425047976058418e-05, + "loss": 0.04, + "step": 12090 + }, + { + "epoch": 0.3459614010007148, + "grad_norm": 0.6670883297920227, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0397, + "step": 12100 + }, + { + "epoch": 0.3462473195139385, + "grad_norm": 0.49804338812828064, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0431, + "step": 12110 + }, + { + "epoch": 0.34653323802716224, + "grad_norm": 0.33912673592567444, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0492, + "step": 12120 + }, + { + "epoch": 0.346819156540386, + "grad_norm": 0.45478618144989014, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0427, + "step": 12130 + }, + { + "epoch": 0.3471050750536097, + "grad_norm": 0.6690845489501953, + "learning_rate": 1.420497389129506e-05, + "loss": 0.044, + "step": 12140 + }, + { + "epoch": 0.34739099356683345, + "grad_norm": 0.9296556115150452, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.042, + "step": 12150 + }, + { + "epoch": 0.3476769120800572, + "grad_norm": 0.4859760105609894, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0386, + "step": 12160 + }, + { + "epoch": 0.3479628305932809, + "grad_norm": 1.0067541599273682, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0495, + "step": 12170 + }, + { + "epoch": 0.34824874910650466, + "grad_norm": 0.7799471616744995, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0614, + "step": 12180 + }, + { + "epoch": 0.3485346676197284, + "grad_norm": 0.48603832721710205, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0422, + "step": 12190 + }, + { + "epoch": 0.3488205861329521, + "grad_norm": 1.2030225992202759, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0535, + "step": 12200 + }, + { + "epoch": 0.3491065046461758, + "grad_norm": 0.5523782968521118, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0437, + "step": 12210 + }, + { + "epoch": 0.34939242315939956, + "grad_norm": 0.9041968584060669, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0441, + "step": 12220 + }, + { + "epoch": 0.3496783416726233, + "grad_norm": 0.5859020948410034, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0451, + "step": 12230 + }, + { + "epoch": 0.34996426018584703, + "grad_norm": 0.8736525177955627, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0439, + "step": 12240 + }, + { + "epoch": 0.35025017869907077, + "grad_norm": 0.4692678153514862, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0516, + "step": 12250 + }, + { + "epoch": 0.3505360972122945, + "grad_norm": 0.6326560974121094, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0427, + "step": 12260 + }, + { + "epoch": 0.35082201572551824, + "grad_norm": 0.6265914440155029, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0392, + "step": 12270 + }, + { + "epoch": 0.351107934238742, + "grad_norm": 0.8684681057929993, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0416, + "step": 12280 + }, + { + "epoch": 0.35139385275196566, + "grad_norm": 0.6076116561889648, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0405, + "step": 12290 + }, + { + "epoch": 0.3516797712651894, + "grad_norm": 0.36192813515663147, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0417, + "step": 12300 + }, + { + "epoch": 0.35196568977841314, + "grad_norm": 0.5561486482620239, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0397, + "step": 12310 + }, + { + "epoch": 0.3522516082916369, + "grad_norm": 0.5955346822738647, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0332, + "step": 12320 + }, + { + "epoch": 0.3525375268048606, + "grad_norm": 0.4861294627189636, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0423, + "step": 12330 + }, + { + "epoch": 0.35282344531808435, + "grad_norm": 0.920704185962677, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0467, + "step": 12340 + }, + { + "epoch": 0.3531093638313081, + "grad_norm": 0.4749159514904022, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0425, + "step": 12350 + }, + { + "epoch": 0.3533952823445318, + "grad_norm": 0.5075432658195496, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0362, + "step": 12360 + }, + { + "epoch": 0.35368120085775556, + "grad_norm": 0.3057022988796234, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0378, + "step": 12370 + }, + { + "epoch": 0.35396711937097924, + "grad_norm": 0.48122167587280273, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0359, + "step": 12380 + }, + { + "epoch": 0.354253037884203, + "grad_norm": 0.39227673411369324, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0432, + "step": 12390 + }, + { + "epoch": 0.3545389563974267, + "grad_norm": 0.641839861869812, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0422, + "step": 12400 + }, + { + "epoch": 0.35482487491065046, + "grad_norm": 1.0422887802124023, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0445, + "step": 12410 + }, + { + "epoch": 0.3551107934238742, + "grad_norm": 0.5336428880691528, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0408, + "step": 12420 + }, + { + "epoch": 0.35539671193709793, + "grad_norm": 0.6634368896484375, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0374, + "step": 12430 + }, + { + "epoch": 0.35568263045032167, + "grad_norm": 0.5840758085250854, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0417, + "step": 12440 + }, + { + "epoch": 0.3559685489635454, + "grad_norm": 0.8465530872344971, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0449, + "step": 12450 + }, + { + "epoch": 0.35625446747676914, + "grad_norm": 0.48737838864326477, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0439, + "step": 12460 + }, + { + "epoch": 0.3565403859899928, + "grad_norm": 1.2267687320709229, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0395, + "step": 12470 + }, + { + "epoch": 0.35682630450321656, + "grad_norm": 0.4097842276096344, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0379, + "step": 12480 + }, + { + "epoch": 0.3571122230164403, + "grad_norm": 0.8895343542098999, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0415, + "step": 12490 + }, + { + "epoch": 0.35739814152966404, + "grad_norm": 0.6732933521270752, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0432, + "step": 12500 + }, + { + "epoch": 0.3576840600428878, + "grad_norm": 0.4521937966346741, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0442, + "step": 12510 + }, + { + "epoch": 0.3579699785561115, + "grad_norm": 0.5932701826095581, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0407, + "step": 12520 + }, + { + "epoch": 0.35825589706933525, + "grad_norm": 0.5595138669013977, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0387, + "step": 12530 + }, + { + "epoch": 0.358541815582559, + "grad_norm": 0.7205538153648376, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0393, + "step": 12540 + }, + { + "epoch": 0.3588277340957827, + "grad_norm": 0.4069580137729645, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0554, + "step": 12550 + }, + { + "epoch": 0.3591136526090064, + "grad_norm": 0.4881740212440491, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0411, + "step": 12560 + }, + { + "epoch": 0.35939957112223014, + "grad_norm": 0.7710328102111816, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.043, + "step": 12570 + }, + { + "epoch": 0.3596854896354539, + "grad_norm": 0.6593908071517944, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.046, + "step": 12580 + }, + { + "epoch": 0.3599714081486776, + "grad_norm": 0.6712149977684021, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0392, + "step": 12590 + }, + { + "epoch": 0.36025732666190136, + "grad_norm": 0.6103658080101013, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0482, + "step": 12600 + }, + { + "epoch": 0.3605432451751251, + "grad_norm": 0.5170528292655945, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0441, + "step": 12610 + }, + { + "epoch": 0.36082916368834883, + "grad_norm": 0.47434374690055847, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0436, + "step": 12620 + }, + { + "epoch": 0.36111508220157257, + "grad_norm": 0.6546452045440674, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0441, + "step": 12630 + }, + { + "epoch": 0.3614010007147963, + "grad_norm": 1.3334686756134033, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0464, + "step": 12640 + }, + { + "epoch": 0.36168691922802, + "grad_norm": 1.3882309198379517, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0527, + "step": 12650 + }, + { + "epoch": 0.3619728377412437, + "grad_norm": 0.829872190952301, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0499, + "step": 12660 + }, + { + "epoch": 0.36225875625446746, + "grad_norm": 0.6917227506637573, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0513, + "step": 12670 + }, + { + "epoch": 0.3625446747676912, + "grad_norm": 0.3825722634792328, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0455, + "step": 12680 + }, + { + "epoch": 0.36283059328091494, + "grad_norm": 0.7726976275444031, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0366, + "step": 12690 + }, + { + "epoch": 0.3631165117941387, + "grad_norm": 0.48851099610328674, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0363, + "step": 12700 + }, + { + "epoch": 0.3634024303073624, + "grad_norm": 0.5034362077713013, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0461, + "step": 12710 + }, + { + "epoch": 0.36368834882058615, + "grad_norm": 0.8411096334457397, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0448, + "step": 12720 + }, + { + "epoch": 0.3639742673338099, + "grad_norm": 0.7185337543487549, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0366, + "step": 12730 + }, + { + "epoch": 0.36426018584703357, + "grad_norm": 0.5850857496261597, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0414, + "step": 12740 + }, + { + "epoch": 0.3645461043602573, + "grad_norm": 0.47304606437683105, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0464, + "step": 12750 + }, + { + "epoch": 0.36483202287348104, + "grad_norm": 0.7190109491348267, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0418, + "step": 12760 + }, + { + "epoch": 0.3651179413867048, + "grad_norm": 0.8053406476974487, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0407, + "step": 12770 + }, + { + "epoch": 0.3654038598999285, + "grad_norm": 0.8875076174736023, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0471, + "step": 12780 + }, + { + "epoch": 0.36568977841315226, + "grad_norm": 0.5206999182701111, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0478, + "step": 12790 + }, + { + "epoch": 0.365975696926376, + "grad_norm": 0.5034269690513611, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0422, + "step": 12800 + }, + { + "epoch": 0.36626161543959973, + "grad_norm": 0.9846853017807007, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.045, + "step": 12810 + }, + { + "epoch": 0.36654753395282347, + "grad_norm": 0.49341151118278503, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0471, + "step": 12820 + }, + { + "epoch": 0.36683345246604715, + "grad_norm": 0.765583336353302, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0411, + "step": 12830 + }, + { + "epoch": 0.3671193709792709, + "grad_norm": 0.5193378925323486, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0522, + "step": 12840 + }, + { + "epoch": 0.3674052894924946, + "grad_norm": 0.8142374157905579, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0374, + "step": 12850 + }, + { + "epoch": 0.36769120800571836, + "grad_norm": 0.7233540415763855, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0516, + "step": 12860 + }, + { + "epoch": 0.3679771265189421, + "grad_norm": 0.38758793473243713, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0437, + "step": 12870 + }, + { + "epoch": 0.36826304503216584, + "grad_norm": 0.36923956871032715, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.041, + "step": 12880 + }, + { + "epoch": 0.3685489635453896, + "grad_norm": 1.0518147945404053, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0446, + "step": 12890 + }, + { + "epoch": 0.3688348820586133, + "grad_norm": 0.5833591818809509, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0362, + "step": 12900 + }, + { + "epoch": 0.36912080057183705, + "grad_norm": 0.6178849339485168, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.041, + "step": 12910 + }, + { + "epoch": 0.36940671908506073, + "grad_norm": 0.7599044442176819, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0473, + "step": 12920 + }, + { + "epoch": 0.36969263759828447, + "grad_norm": 0.7787651419639587, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0461, + "step": 12930 + }, + { + "epoch": 0.3699785561115082, + "grad_norm": 0.3847586512565613, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0413, + "step": 12940 + }, + { + "epoch": 0.37026447462473194, + "grad_norm": 0.6218805313110352, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0424, + "step": 12950 + }, + { + "epoch": 0.3705503931379557, + "grad_norm": 0.6770363450050354, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0426, + "step": 12960 + }, + { + "epoch": 0.3708363116511794, + "grad_norm": 0.6817107796669006, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.041, + "step": 12970 + }, + { + "epoch": 0.37112223016440316, + "grad_norm": 1.6997944116592407, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0626, + "step": 12980 + }, + { + "epoch": 0.3714081486776269, + "grad_norm": 0.4540708363056183, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0356, + "step": 12990 + }, + { + "epoch": 0.37169406719085063, + "grad_norm": 0.4272336959838867, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0354, + "step": 13000 + }, + { + "epoch": 0.3719799857040743, + "grad_norm": 0.4723891019821167, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0438, + "step": 13010 + }, + { + "epoch": 0.37226590421729805, + "grad_norm": 0.5508099794387817, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.042, + "step": 13020 + }, + { + "epoch": 0.3725518227305218, + "grad_norm": 1.05836021900177, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0472, + "step": 13030 + }, + { + "epoch": 0.3728377412437455, + "grad_norm": 0.4397801458835602, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0462, + "step": 13040 + }, + { + "epoch": 0.37312365975696926, + "grad_norm": 0.3131158649921417, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0383, + "step": 13050 + }, + { + "epoch": 0.373409578270193, + "grad_norm": 0.5489990711212158, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0398, + "step": 13060 + }, + { + "epoch": 0.37369549678341674, + "grad_norm": 0.7425751686096191, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0416, + "step": 13070 + }, + { + "epoch": 0.3739814152966405, + "grad_norm": 0.6337125301361084, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0387, + "step": 13080 + }, + { + "epoch": 0.3742673338098642, + "grad_norm": 0.656467854976654, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0431, + "step": 13090 + }, + { + "epoch": 0.3745532523230879, + "grad_norm": 0.7011964321136475, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0487, + "step": 13100 + }, + { + "epoch": 0.37483917083631163, + "grad_norm": 0.4949609041213989, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0429, + "step": 13110 + }, + { + "epoch": 0.37512508934953537, + "grad_norm": 0.6796516180038452, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0405, + "step": 13120 + }, + { + "epoch": 0.3754110078627591, + "grad_norm": 0.41161492466926575, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0359, + "step": 13130 + }, + { + "epoch": 0.37569692637598284, + "grad_norm": 0.4463254511356354, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0353, + "step": 13140 + }, + { + "epoch": 0.3759828448892066, + "grad_norm": 0.4082377254962921, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.047, + "step": 13150 + }, + { + "epoch": 0.3762687634024303, + "grad_norm": 0.7927104830741882, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0484, + "step": 13160 + }, + { + "epoch": 0.37655468191565405, + "grad_norm": 0.5212385058403015, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.041, + "step": 13170 + }, + { + "epoch": 0.3768406004288778, + "grad_norm": 0.7408128380775452, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0462, + "step": 13180 + }, + { + "epoch": 0.3771265189421015, + "grad_norm": 0.3847906291484833, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0361, + "step": 13190 + }, + { + "epoch": 0.3774124374553252, + "grad_norm": 0.5039756298065186, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0385, + "step": 13200 + }, + { + "epoch": 0.37769835596854895, + "grad_norm": 0.5682945251464844, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0369, + "step": 13210 + }, + { + "epoch": 0.3779842744817727, + "grad_norm": 0.5985261797904968, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0376, + "step": 13220 + }, + { + "epoch": 0.3782701929949964, + "grad_norm": 0.7080312967300415, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0514, + "step": 13230 + }, + { + "epoch": 0.37855611150822016, + "grad_norm": 0.7488406300544739, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0421, + "step": 13240 + }, + { + "epoch": 0.3788420300214439, + "grad_norm": 0.38066044449806213, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0411, + "step": 13250 + }, + { + "epoch": 0.37912794853466764, + "grad_norm": 0.6335283517837524, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0526, + "step": 13260 + }, + { + "epoch": 0.3794138670478914, + "grad_norm": 0.7008160352706909, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0402, + "step": 13270 + }, + { + "epoch": 0.37969978556111506, + "grad_norm": 0.4219777286052704, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.039, + "step": 13280 + }, + { + "epoch": 0.3799857040743388, + "grad_norm": 0.6447705030441284, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0412, + "step": 13290 + }, + { + "epoch": 0.38027162258756253, + "grad_norm": 0.4625374674797058, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0437, + "step": 13300 + }, + { + "epoch": 0.38055754110078627, + "grad_norm": 0.4056257903575897, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0377, + "step": 13310 + }, + { + "epoch": 0.38084345961401, + "grad_norm": 0.425281286239624, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0378, + "step": 13320 + }, + { + "epoch": 0.38112937812723374, + "grad_norm": 0.4031837582588196, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0361, + "step": 13330 + }, + { + "epoch": 0.3814152966404575, + "grad_norm": 0.469175785779953, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0391, + "step": 13340 + }, + { + "epoch": 0.3817012151536812, + "grad_norm": 0.36555227637290955, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0352, + "step": 13350 + }, + { + "epoch": 0.38198713366690495, + "grad_norm": 0.8802763819694519, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0412, + "step": 13360 + }, + { + "epoch": 0.38227305218012864, + "grad_norm": 0.5733079314231873, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0418, + "step": 13370 + }, + { + "epoch": 0.3825589706933524, + "grad_norm": 0.606238842010498, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0518, + "step": 13380 + }, + { + "epoch": 0.3828448892065761, + "grad_norm": 0.5096673369407654, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0404, + "step": 13390 + }, + { + "epoch": 0.38313080771979985, + "grad_norm": 0.8240867853164673, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0513, + "step": 13400 + }, + { + "epoch": 0.3834167262330236, + "grad_norm": 0.3757685422897339, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0407, + "step": 13410 + }, + { + "epoch": 0.3837026447462473, + "grad_norm": 0.4560941755771637, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0429, + "step": 13420 + }, + { + "epoch": 0.38398856325947106, + "grad_norm": 0.42831951379776, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0387, + "step": 13430 + }, + { + "epoch": 0.3842744817726948, + "grad_norm": 0.8373785614967346, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0473, + "step": 13440 + }, + { + "epoch": 0.38456040028591854, + "grad_norm": 0.9560670256614685, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0442, + "step": 13450 + }, + { + "epoch": 0.3848463187991422, + "grad_norm": 0.4101570248603821, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0429, + "step": 13460 + }, + { + "epoch": 0.38513223731236595, + "grad_norm": 0.673739492893219, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0525, + "step": 13470 + }, + { + "epoch": 0.3854181558255897, + "grad_norm": 1.126909852027893, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0499, + "step": 13480 + }, + { + "epoch": 0.38570407433881343, + "grad_norm": 0.571437656879425, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0431, + "step": 13490 + }, + { + "epoch": 0.38598999285203717, + "grad_norm": 0.5121229887008667, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0419, + "step": 13500 + }, + { + "epoch": 0.3862759113652609, + "grad_norm": 0.6143786907196045, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0373, + "step": 13510 + }, + { + "epoch": 0.38656182987848464, + "grad_norm": 0.395014226436615, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0457, + "step": 13520 + }, + { + "epoch": 0.3868477483917084, + "grad_norm": 0.46027693152427673, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0372, + "step": 13530 + }, + { + "epoch": 0.3871336669049321, + "grad_norm": 0.42744559049606323, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0417, + "step": 13540 + }, + { + "epoch": 0.3874195854181558, + "grad_norm": 0.4765837490558624, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0442, + "step": 13550 + }, + { + "epoch": 0.38770550393137954, + "grad_norm": 0.9767054319381714, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0397, + "step": 13560 + }, + { + "epoch": 0.3879914224446033, + "grad_norm": 0.5535935759544373, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0423, + "step": 13570 + }, + { + "epoch": 0.388277340957827, + "grad_norm": 0.3802829384803772, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0329, + "step": 13580 + }, + { + "epoch": 0.38856325947105075, + "grad_norm": 0.6564178466796875, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0423, + "step": 13590 + }, + { + "epoch": 0.3888491779842745, + "grad_norm": 0.4400223195552826, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0356, + "step": 13600 + }, + { + "epoch": 0.3891350964974982, + "grad_norm": 0.4441612958908081, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0576, + "step": 13610 + }, + { + "epoch": 0.38942101501072196, + "grad_norm": 0.5270922780036926, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0406, + "step": 13620 + }, + { + "epoch": 0.3897069335239457, + "grad_norm": 0.6497722268104553, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0389, + "step": 13630 + }, + { + "epoch": 0.3899928520371694, + "grad_norm": 0.628182053565979, + "learning_rate": 1.280216624157504e-05, + "loss": 0.049, + "step": 13640 + }, + { + "epoch": 0.3902787705503931, + "grad_norm": 0.5242640376091003, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0389, + "step": 13650 + }, + { + "epoch": 0.39056468906361685, + "grad_norm": 0.5140895843505859, + "learning_rate": 1.278305741539386e-05, + "loss": 0.047, + "step": 13660 + }, + { + "epoch": 0.3908506075768406, + "grad_norm": 0.531012773513794, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0415, + "step": 13670 + }, + { + "epoch": 0.39113652609006433, + "grad_norm": 0.5066007375717163, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0411, + "step": 13680 + }, + { + "epoch": 0.39142244460328807, + "grad_norm": 1.0783177614212036, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0371, + "step": 13690 + }, + { + "epoch": 0.3917083631165118, + "grad_norm": 0.592755913734436, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0402, + "step": 13700 + }, + { + "epoch": 0.39199428162973554, + "grad_norm": 0.5595790147781372, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0543, + "step": 13710 + }, + { + "epoch": 0.3922802001429593, + "grad_norm": 0.5388237237930298, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0487, + "step": 13720 + }, + { + "epoch": 0.39256611865618296, + "grad_norm": 0.5311065316200256, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0389, + "step": 13730 + }, + { + "epoch": 0.3928520371694067, + "grad_norm": 0.8037494421005249, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0456, + "step": 13740 + }, + { + "epoch": 0.39313795568263044, + "grad_norm": 0.851921796798706, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0389, + "step": 13750 + }, + { + "epoch": 0.3934238741958542, + "grad_norm": 0.5924596190452576, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0401, + "step": 13760 + }, + { + "epoch": 0.3937097927090779, + "grad_norm": 0.5660725831985474, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0443, + "step": 13770 + }, + { + "epoch": 0.39399571122230165, + "grad_norm": 0.4110502004623413, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0438, + "step": 13780 + }, + { + "epoch": 0.3942816297355254, + "grad_norm": 0.7104408144950867, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.042, + "step": 13790 + }, + { + "epoch": 0.3945675482487491, + "grad_norm": 0.5490137338638306, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0477, + "step": 13800 + }, + { + "epoch": 0.39485346676197286, + "grad_norm": 0.4189203083515167, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0446, + "step": 13810 + }, + { + "epoch": 0.39513938527519654, + "grad_norm": 3.620929479598999, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0541, + "step": 13820 + }, + { + "epoch": 0.3954253037884203, + "grad_norm": 0.4670915901660919, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0391, + "step": 13830 + }, + { + "epoch": 0.395711222301644, + "grad_norm": 0.4475649297237396, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.04, + "step": 13840 + }, + { + "epoch": 0.39599714081486775, + "grad_norm": 0.4646693170070648, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0412, + "step": 13850 + }, + { + "epoch": 0.3962830593280915, + "grad_norm": 0.4141371250152588, + "learning_rate": 1.259152361972498e-05, + "loss": 0.039, + "step": 13860 + }, + { + "epoch": 0.39656897784131523, + "grad_norm": 0.7549411058425903, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0414, + "step": 13870 + }, + { + "epoch": 0.39685489635453897, + "grad_norm": 0.5687856078147888, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0441, + "step": 13880 + }, + { + "epoch": 0.3971408148677627, + "grad_norm": 0.582946240901947, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0451, + "step": 13890 + }, + { + "epoch": 0.39742673338098644, + "grad_norm": 0.6410595178604126, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0362, + "step": 13900 + }, + { + "epoch": 0.3977126518942101, + "grad_norm": 0.4375670850276947, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0552, + "step": 13910 + }, + { + "epoch": 0.39799857040743386, + "grad_norm": 0.5675646662712097, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0373, + "step": 13920 + }, + { + "epoch": 0.3982844889206576, + "grad_norm": 0.544170618057251, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0449, + "step": 13930 + }, + { + "epoch": 0.39857040743388134, + "grad_norm": 0.44928276538848877, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0461, + "step": 13940 + }, + { + "epoch": 0.3988563259471051, + "grad_norm": 0.511382520198822, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0413, + "step": 13950 + }, + { + "epoch": 0.3991422444603288, + "grad_norm": 0.38443753123283386, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0374, + "step": 13960 + }, + { + "epoch": 0.39942816297355255, + "grad_norm": 0.5726080536842346, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0553, + "step": 13970 + }, + { + "epoch": 0.3997140814867763, + "grad_norm": 0.554694414138794, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0404, + "step": 13980 + }, + { + "epoch": 0.4, + "grad_norm": 0.4891316592693329, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0418, + "step": 13990 + }, + { + "epoch": 0.4002859185132237, + "grad_norm": 0.5150312781333923, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0418, + "step": 14000 + }, + { + "epoch": 0.40057183702644744, + "grad_norm": 0.9077253937721252, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0415, + "step": 14010 + }, + { + "epoch": 0.4008577555396712, + "grad_norm": 0.9126781225204468, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.048, + "step": 14020 + }, + { + "epoch": 0.4011436740528949, + "grad_norm": 0.6264623999595642, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0411, + "step": 14030 + }, + { + "epoch": 0.40142959256611865, + "grad_norm": 0.523853600025177, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.051, + "step": 14040 + }, + { + "epoch": 0.4017155110793424, + "grad_norm": 0.6340035200119019, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0426, + "step": 14050 + }, + { + "epoch": 0.40200142959256613, + "grad_norm": 0.3594725430011749, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0397, + "step": 14060 + }, + { + "epoch": 0.40228734810578987, + "grad_norm": 0.941470742225647, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0402, + "step": 14070 + }, + { + "epoch": 0.4025732666190136, + "grad_norm": 0.840506911277771, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0473, + "step": 14080 + }, + { + "epoch": 0.4028591851322373, + "grad_norm": 0.3359200954437256, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0405, + "step": 14090 + }, + { + "epoch": 0.403145103645461, + "grad_norm": 0.49658629298210144, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0464, + "step": 14100 + }, + { + "epoch": 0.40343102215868476, + "grad_norm": 0.7940187454223633, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0417, + "step": 14110 + }, + { + "epoch": 0.4037169406719085, + "grad_norm": 0.30110660195350647, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0371, + "step": 14120 + }, + { + "epoch": 0.40400285918513223, + "grad_norm": 0.42845240235328674, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.053, + "step": 14130 + }, + { + "epoch": 0.40428877769835597, + "grad_norm": 0.997348427772522, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.041, + "step": 14140 + }, + { + "epoch": 0.4045746962115797, + "grad_norm": 0.4759966731071472, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0377, + "step": 14150 + }, + { + "epoch": 0.40486061472480345, + "grad_norm": 0.42045602202415466, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0397, + "step": 14160 + }, + { + "epoch": 0.4051465332380272, + "grad_norm": 0.6400002837181091, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0507, + "step": 14170 + }, + { + "epoch": 0.40543245175125087, + "grad_norm": 0.5473673939704895, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0359, + "step": 14180 + }, + { + "epoch": 0.4057183702644746, + "grad_norm": 0.7414730787277222, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0416, + "step": 14190 + }, + { + "epoch": 0.40600428877769834, + "grad_norm": 0.4691861867904663, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0363, + "step": 14200 + }, + { + "epoch": 0.4062902072909221, + "grad_norm": 0.9186112880706787, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0445, + "step": 14210 + }, + { + "epoch": 0.4065761258041458, + "grad_norm": 0.6782190203666687, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.40686204431736955, + "grad_norm": 0.6948013305664062, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.037, + "step": 14230 + }, + { + "epoch": 0.4071479628305933, + "grad_norm": 0.3034680485725403, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0371, + "step": 14240 + }, + { + "epoch": 0.40743388134381703, + "grad_norm": 0.4254174828529358, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0449, + "step": 14250 + }, + { + "epoch": 0.40771979985704077, + "grad_norm": 1.3622064590454102, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0428, + "step": 14260 + }, + { + "epoch": 0.40800571837026445, + "grad_norm": 0.5928359031677246, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0424, + "step": 14270 + }, + { + "epoch": 0.4082916368834882, + "grad_norm": 0.9103132486343384, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0414, + "step": 14280 + }, + { + "epoch": 0.4085775553967119, + "grad_norm": 0.6338028311729431, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0376, + "step": 14290 + }, + { + "epoch": 0.40886347390993566, + "grad_norm": 0.9920284748077393, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0393, + "step": 14300 + }, + { + "epoch": 0.4091493924231594, + "grad_norm": 0.411830335855484, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0336, + "step": 14310 + }, + { + "epoch": 0.40943531093638313, + "grad_norm": 0.6977682709693909, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0454, + "step": 14320 + }, + { + "epoch": 0.40972122944960687, + "grad_norm": 0.6303663849830627, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0453, + "step": 14330 + }, + { + "epoch": 0.4100071479628306, + "grad_norm": 0.3048207759857178, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0373, + "step": 14340 + }, + { + "epoch": 0.41029306647605435, + "grad_norm": 0.7683395743370056, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0438, + "step": 14350 + }, + { + "epoch": 0.41057898498927803, + "grad_norm": 0.5791511535644531, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0392, + "step": 14360 + }, + { + "epoch": 0.41086490350250177, + "grad_norm": 0.876626193523407, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0324, + "step": 14370 + }, + { + "epoch": 0.4111508220157255, + "grad_norm": 0.5971815586090088, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0368, + "step": 14380 + }, + { + "epoch": 0.41143674052894924, + "grad_norm": 0.6508862376213074, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0411, + "step": 14390 + }, + { + "epoch": 0.411722659042173, + "grad_norm": 0.4704359471797943, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0351, + "step": 14400 + }, + { + "epoch": 0.4120085775553967, + "grad_norm": 0.4266453683376312, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0367, + "step": 14410 + }, + { + "epoch": 0.41229449606862045, + "grad_norm": 0.5898434519767761, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0376, + "step": 14420 + }, + { + "epoch": 0.4125804145818442, + "grad_norm": 0.8741532564163208, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0419, + "step": 14430 + }, + { + "epoch": 0.41286633309506793, + "grad_norm": 0.24328190088272095, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0333, + "step": 14440 + }, + { + "epoch": 0.4131522516082916, + "grad_norm": 0.4263601303100586, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.039, + "step": 14450 + }, + { + "epoch": 0.41343817012151535, + "grad_norm": 0.6311615109443665, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0454, + "step": 14460 + }, + { + "epoch": 0.4137240886347391, + "grad_norm": 0.7424519658088684, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0392, + "step": 14470 + }, + { + "epoch": 0.4140100071479628, + "grad_norm": 0.48323145508766174, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0374, + "step": 14480 + }, + { + "epoch": 0.41429592566118656, + "grad_norm": 0.38597407937049866, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0393, + "step": 14490 + }, + { + "epoch": 0.4145818441744103, + "grad_norm": 0.7251518964767456, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0431, + "step": 14500 + }, + { + "epoch": 0.41486776268763403, + "grad_norm": 0.44361060857772827, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0426, + "step": 14510 + }, + { + "epoch": 0.41515368120085777, + "grad_norm": 0.5625014305114746, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0372, + "step": 14520 + }, + { + "epoch": 0.4154395997140815, + "grad_norm": 0.27855798602104187, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0356, + "step": 14530 + }, + { + "epoch": 0.4157255182273052, + "grad_norm": 0.5966296195983887, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0387, + "step": 14540 + }, + { + "epoch": 0.41601143674052893, + "grad_norm": 0.49445512890815735, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0355, + "step": 14550 + }, + { + "epoch": 0.41629735525375267, + "grad_norm": 0.3813278377056122, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0456, + "step": 14560 + }, + { + "epoch": 0.4165832737669764, + "grad_norm": 0.5962988138198853, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0401, + "step": 14570 + }, + { + "epoch": 0.41686919228020014, + "grad_norm": 0.4028547406196594, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0371, + "step": 14580 + }, + { + "epoch": 0.4171551107934239, + "grad_norm": 1.348706841468811, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0426, + "step": 14590 + }, + { + "epoch": 0.4174410293066476, + "grad_norm": 1.2782070636749268, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0393, + "step": 14600 + }, + { + "epoch": 0.41772694781987135, + "grad_norm": 1.0024999380111694, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0436, + "step": 14610 + }, + { + "epoch": 0.4180128663330951, + "grad_norm": 0.35450127720832825, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0411, + "step": 14620 + }, + { + "epoch": 0.41829878484631877, + "grad_norm": 0.5827250480651855, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0372, + "step": 14630 + }, + { + "epoch": 0.4185847033595425, + "grad_norm": 0.5905774235725403, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0394, + "step": 14640 + }, + { + "epoch": 0.41887062187276625, + "grad_norm": 0.652074933052063, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0405, + "step": 14650 + }, + { + "epoch": 0.41915654038599, + "grad_norm": 0.7245490550994873, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0473, + "step": 14660 + }, + { + "epoch": 0.4194424588992137, + "grad_norm": 0.5153012871742249, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.043, + "step": 14670 + }, + { + "epoch": 0.41972837741243746, + "grad_norm": 0.516107976436615, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0434, + "step": 14680 + }, + { + "epoch": 0.4200142959256612, + "grad_norm": 0.4743354618549347, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0429, + "step": 14690 + }, + { + "epoch": 0.42030021443888493, + "grad_norm": 0.547875165939331, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0395, + "step": 14700 + }, + { + "epoch": 0.42058613295210867, + "grad_norm": 0.6398400068283081, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0384, + "step": 14710 + }, + { + "epoch": 0.42087205146533235, + "grad_norm": 0.5891467332839966, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0399, + "step": 14720 + }, + { + "epoch": 0.4211579699785561, + "grad_norm": 0.3927595615386963, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0353, + "step": 14730 + }, + { + "epoch": 0.42144388849177983, + "grad_norm": 0.6477030515670776, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0492, + "step": 14740 + }, + { + "epoch": 0.42172980700500357, + "grad_norm": 0.7090615034103394, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.042, + "step": 14750 + }, + { + "epoch": 0.4220157255182273, + "grad_norm": 0.6572134494781494, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0406, + "step": 14760 + }, + { + "epoch": 0.42230164403145104, + "grad_norm": 0.787663996219635, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0424, + "step": 14770 + }, + { + "epoch": 0.4225875625446748, + "grad_norm": 0.8419309258460999, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0427, + "step": 14780 + }, + { + "epoch": 0.4228734810578985, + "grad_norm": 0.6204128861427307, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0364, + "step": 14790 + }, + { + "epoch": 0.42315939957112225, + "grad_norm": 0.7446070313453674, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0391, + "step": 14800 + }, + { + "epoch": 0.42344531808434593, + "grad_norm": 0.7446451783180237, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0384, + "step": 14810 + }, + { + "epoch": 0.42373123659756967, + "grad_norm": 0.6946475505828857, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0375, + "step": 14820 + }, + { + "epoch": 0.4240171551107934, + "grad_norm": 0.6997008323669434, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0393, + "step": 14830 + }, + { + "epoch": 0.42430307362401715, + "grad_norm": 0.4857316315174103, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0474, + "step": 14840 + }, + { + "epoch": 0.4245889921372409, + "grad_norm": 1.3516888618469238, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.047, + "step": 14850 + }, + { + "epoch": 0.4248749106504646, + "grad_norm": 0.40320220589637756, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0418, + "step": 14860 + }, + { + "epoch": 0.42516082916368836, + "grad_norm": 0.9002796411514282, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0434, + "step": 14870 + }, + { + "epoch": 0.4254467476769121, + "grad_norm": 0.3810071349143982, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0338, + "step": 14880 + }, + { + "epoch": 0.42573266619013583, + "grad_norm": 0.5786157250404358, + "learning_rate": 1.159527607963768e-05, + "loss": 0.037, + "step": 14890 + }, + { + "epoch": 0.4260185847033595, + "grad_norm": 0.6316869258880615, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0388, + "step": 14900 + }, + { + "epoch": 0.42630450321658325, + "grad_norm": 0.608745276927948, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0426, + "step": 14910 + }, + { + "epoch": 0.426590421729807, + "grad_norm": 0.6655036807060242, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0433, + "step": 14920 + }, + { + "epoch": 0.4268763402430307, + "grad_norm": 0.29059523344039917, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0507, + "step": 14930 + }, + { + "epoch": 0.42716225875625446, + "grad_norm": 0.9066076278686523, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0447, + "step": 14940 + }, + { + "epoch": 0.4274481772694782, + "grad_norm": 1.0660220384597778, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0512, + "step": 14950 + }, + { + "epoch": 0.42773409578270194, + "grad_norm": 0.6081144213676453, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0426, + "step": 14960 + }, + { + "epoch": 0.4280200142959257, + "grad_norm": 0.46524369716644287, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0435, + "step": 14970 + }, + { + "epoch": 0.4283059328091494, + "grad_norm": 0.3497388958930969, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0492, + "step": 14980 + }, + { + "epoch": 0.4285918513223731, + "grad_norm": 0.41300803422927856, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.034, + "step": 14990 + }, + { + "epoch": 0.42887776983559683, + "grad_norm": 0.4363289177417755, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0358, + "step": 15000 + }, + { + "epoch": 0.42916368834882057, + "grad_norm": 1.314915418624878, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.047, + "step": 15010 + }, + { + "epoch": 0.4294496068620443, + "grad_norm": 0.558199942111969, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0313, + "step": 15020 + }, + { + "epoch": 0.42973552537526805, + "grad_norm": 0.3857463598251343, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0416, + "step": 15030 + }, + { + "epoch": 0.4300214438884918, + "grad_norm": 0.4701749384403229, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0425, + "step": 15040 + }, + { + "epoch": 0.4303073624017155, + "grad_norm": 0.4611213803291321, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0457, + "step": 15050 + }, + { + "epoch": 0.43059328091493926, + "grad_norm": 0.5338016152381897, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.038, + "step": 15060 + }, + { + "epoch": 0.430879199428163, + "grad_norm": 0.9078943133354187, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0395, + "step": 15070 + }, + { + "epoch": 0.4311651179413867, + "grad_norm": 0.5354048013687134, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0403, + "step": 15080 + }, + { + "epoch": 0.4314510364546104, + "grad_norm": 0.35511279106140137, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0377, + "step": 15090 + }, + { + "epoch": 0.43173695496783415, + "grad_norm": 0.37104350328445435, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0426, + "step": 15100 + }, + { + "epoch": 0.4320228734810579, + "grad_norm": 0.8916210532188416, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0387, + "step": 15110 + }, + { + "epoch": 0.4323087919942816, + "grad_norm": 0.514994740486145, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0384, + "step": 15120 + }, + { + "epoch": 0.43259471050750536, + "grad_norm": 0.8440690040588379, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0437, + "step": 15130 + }, + { + "epoch": 0.4328806290207291, + "grad_norm": 0.6815949082374573, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0453, + "step": 15140 + }, + { + "epoch": 0.43316654753395284, + "grad_norm": 0.33178189396858215, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0351, + "step": 15150 + }, + { + "epoch": 0.4334524660471766, + "grad_norm": 0.5686727166175842, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0368, + "step": 15160 + }, + { + "epoch": 0.43373838456040026, + "grad_norm": 0.44143930077552795, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0443, + "step": 15170 + }, + { + "epoch": 0.434024303073624, + "grad_norm": 0.3238232135772705, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0348, + "step": 15180 + }, + { + "epoch": 0.43431022158684773, + "grad_norm": 0.5038242340087891, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0343, + "step": 15190 + }, + { + "epoch": 0.43459614010007147, + "grad_norm": 0.4904351234436035, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0397, + "step": 15200 + }, + { + "epoch": 0.4348820586132952, + "grad_norm": 0.5325750708580017, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0499, + "step": 15210 + }, + { + "epoch": 0.43516797712651895, + "grad_norm": 0.39443954825401306, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.044, + "step": 15220 + }, + { + "epoch": 0.4354538956397427, + "grad_norm": 0.6782003045082092, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0358, + "step": 15230 + }, + { + "epoch": 0.4357398141529664, + "grad_norm": 0.47862571477890015, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0418, + "step": 15240 + }, + { + "epoch": 0.43602573266619016, + "grad_norm": 1.6515535116195679, + "learning_rate": 1.124468908014616e-05, + "loss": 0.043, + "step": 15250 + }, + { + "epoch": 0.43631165117941384, + "grad_norm": 0.4902660846710205, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0371, + "step": 15260 + }, + { + "epoch": 0.4365975696926376, + "grad_norm": 0.5742762088775635, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0369, + "step": 15270 + }, + { + "epoch": 0.4368834882058613, + "grad_norm": 0.42058590054512024, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0378, + "step": 15280 + }, + { + "epoch": 0.43716940671908505, + "grad_norm": 0.43729284405708313, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0352, + "step": 15290 + }, + { + "epoch": 0.4374553252323088, + "grad_norm": 0.4689466953277588, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0433, + "step": 15300 + }, + { + "epoch": 0.4377412437455325, + "grad_norm": 0.6272432208061218, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0548, + "step": 15310 + }, + { + "epoch": 0.43802716225875626, + "grad_norm": 1.1129611730575562, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0437, + "step": 15320 + }, + { + "epoch": 0.43831308077198, + "grad_norm": 0.9332655072212219, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0503, + "step": 15330 + }, + { + "epoch": 0.43859899928520374, + "grad_norm": 0.35150477290153503, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0351, + "step": 15340 + }, + { + "epoch": 0.4388849177984274, + "grad_norm": 0.3826565444469452, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0361, + "step": 15350 + }, + { + "epoch": 0.43917083631165116, + "grad_norm": 0.817319393157959, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0352, + "step": 15360 + }, + { + "epoch": 0.4394567548248749, + "grad_norm": 0.4379598796367645, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0469, + "step": 15370 + }, + { + "epoch": 0.43974267333809863, + "grad_norm": 0.6475314497947693, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0456, + "step": 15380 + }, + { + "epoch": 0.44002859185132237, + "grad_norm": 0.529088020324707, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0453, + "step": 15390 + }, + { + "epoch": 0.4403145103645461, + "grad_norm": 0.4915194809436798, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0369, + "step": 15400 + }, + { + "epoch": 0.44060042887776985, + "grad_norm": 0.4766380786895752, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0391, + "step": 15410 + }, + { + "epoch": 0.4408863473909936, + "grad_norm": 0.34667786955833435, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0327, + "step": 15420 + }, + { + "epoch": 0.4411722659042173, + "grad_norm": 0.504242479801178, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0413, + "step": 15430 + }, + { + "epoch": 0.441458184417441, + "grad_norm": 0.49786439538002014, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0361, + "step": 15440 + }, + { + "epoch": 0.44174410293066474, + "grad_norm": 0.4997329115867615, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0368, + "step": 15450 + }, + { + "epoch": 0.4420300214438885, + "grad_norm": 0.2992185056209564, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0359, + "step": 15460 + }, + { + "epoch": 0.4423159399571122, + "grad_norm": 0.6645393371582031, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0401, + "step": 15470 + }, + { + "epoch": 0.44260185847033595, + "grad_norm": 0.6327983140945435, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0386, + "step": 15480 + }, + { + "epoch": 0.4428877769835597, + "grad_norm": 0.45607903599739075, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0386, + "step": 15490 + }, + { + "epoch": 0.4431736954967834, + "grad_norm": 0.4401610493659973, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0417, + "step": 15500 + }, + { + "epoch": 0.44345961401000716, + "grad_norm": 0.5778466463088989, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0417, + "step": 15510 + }, + { + "epoch": 0.4437455325232309, + "grad_norm": 0.2164914309978485, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0355, + "step": 15520 + }, + { + "epoch": 0.4440314510364546, + "grad_norm": 0.3869318664073944, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0361, + "step": 15530 + }, + { + "epoch": 0.4443173695496783, + "grad_norm": 0.3843154311180115, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0459, + "step": 15540 + }, + { + "epoch": 0.44460328806290206, + "grad_norm": 0.8488825559616089, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0406, + "step": 15550 + }, + { + "epoch": 0.4448892065761258, + "grad_norm": 0.5055183172225952, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0359, + "step": 15560 + }, + { + "epoch": 0.44517512508934953, + "grad_norm": 0.40923011302948, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0435, + "step": 15570 + }, + { + "epoch": 0.44546104360257327, + "grad_norm": 0.48997730016708374, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0395, + "step": 15580 + }, + { + "epoch": 0.445746962115797, + "grad_norm": 0.5149131417274475, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.041, + "step": 15590 + }, + { + "epoch": 0.44603288062902074, + "grad_norm": 0.7277303338050842, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0452, + "step": 15600 + }, + { + "epoch": 0.4463187991422445, + "grad_norm": 0.48676377534866333, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0363, + "step": 15610 + }, + { + "epoch": 0.44660471765546816, + "grad_norm": 0.49031221866607666, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0356, + "step": 15620 + }, + { + "epoch": 0.4468906361686919, + "grad_norm": 0.38877514004707336, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.036, + "step": 15630 + }, + { + "epoch": 0.44717655468191564, + "grad_norm": 0.570068895816803, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0403, + "step": 15640 + }, + { + "epoch": 0.4474624731951394, + "grad_norm": 0.48499882221221924, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0395, + "step": 15650 + }, + { + "epoch": 0.4477483917083631, + "grad_norm": 0.7251732349395752, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0399, + "step": 15660 + }, + { + "epoch": 0.44803431022158685, + "grad_norm": 0.3927334249019623, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0359, + "step": 15670 + }, + { + "epoch": 0.4483202287348106, + "grad_norm": 0.5614549517631531, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.035, + "step": 15680 + }, + { + "epoch": 0.4486061472480343, + "grad_norm": 0.383831262588501, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0416, + "step": 15690 + }, + { + "epoch": 0.44889206576125806, + "grad_norm": 1.9365276098251343, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0498, + "step": 15700 + }, + { + "epoch": 0.44917798427448175, + "grad_norm": 0.6964924931526184, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.034, + "step": 15710 + }, + { + "epoch": 0.4494639027877055, + "grad_norm": 0.5148108601570129, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0401, + "step": 15720 + }, + { + "epoch": 0.4497498213009292, + "grad_norm": 0.4529317617416382, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0361, + "step": 15730 + }, + { + "epoch": 0.45003573981415296, + "grad_norm": 0.6648512482643127, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0365, + "step": 15740 + }, + { + "epoch": 0.4503216583273767, + "grad_norm": 0.8183113932609558, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0416, + "step": 15750 + }, + { + "epoch": 0.45060757684060043, + "grad_norm": 0.8802638649940491, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0406, + "step": 15760 + }, + { + "epoch": 0.45089349535382417, + "grad_norm": 0.6329004764556885, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0395, + "step": 15770 + }, + { + "epoch": 0.4511794138670479, + "grad_norm": 0.35283520817756653, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0364, + "step": 15780 + }, + { + "epoch": 0.45146533238027164, + "grad_norm": 0.5156061053276062, + "learning_rate": 1.071827766589186e-05, + "loss": 0.031, + "step": 15790 + }, + { + "epoch": 0.4517512508934953, + "grad_norm": 0.37875205278396606, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0375, + "step": 15800 + }, + { + "epoch": 0.45203716940671906, + "grad_norm": 0.5543273687362671, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0421, + "step": 15810 + }, + { + "epoch": 0.4523230879199428, + "grad_norm": 0.3808431923389435, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0323, + "step": 15820 + }, + { + "epoch": 0.45260900643316654, + "grad_norm": 0.8648643493652344, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0396, + "step": 15830 + }, + { + "epoch": 0.4528949249463903, + "grad_norm": 0.7893536686897278, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0417, + "step": 15840 + }, + { + "epoch": 0.453180843459614, + "grad_norm": 0.904137134552002, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0384, + "step": 15850 + }, + { + "epoch": 0.45346676197283775, + "grad_norm": 0.6095889806747437, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0457, + "step": 15860 + }, + { + "epoch": 0.4537526804860615, + "grad_norm": 0.5691415667533875, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0438, + "step": 15870 + }, + { + "epoch": 0.4540385989992852, + "grad_norm": 0.37868618965148926, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0414, + "step": 15880 + }, + { + "epoch": 0.4543245175125089, + "grad_norm": 0.7962950468063354, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0405, + "step": 15890 + }, + { + "epoch": 0.45461043602573264, + "grad_norm": 0.8862378597259521, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0475, + "step": 15900 + }, + { + "epoch": 0.4548963545389564, + "grad_norm": 0.8762509822845459, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0472, + "step": 15910 + }, + { + "epoch": 0.4551822730521801, + "grad_norm": 0.6006313562393188, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0417, + "step": 15920 + }, + { + "epoch": 0.45546819156540386, + "grad_norm": 0.3340131938457489, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0374, + "step": 15930 + }, + { + "epoch": 0.4557541100786276, + "grad_norm": 0.2639552056789398, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0387, + "step": 15940 + }, + { + "epoch": 0.45604002859185133, + "grad_norm": 0.42564907670021057, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0376, + "step": 15950 + }, + { + "epoch": 0.45632594710507507, + "grad_norm": 0.503834068775177, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0344, + "step": 15960 + }, + { + "epoch": 0.4566118656182988, + "grad_norm": 0.5962334871292114, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0379, + "step": 15970 + }, + { + "epoch": 0.4568977841315225, + "grad_norm": 0.3271556794643402, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0361, + "step": 15980 + }, + { + "epoch": 0.4571837026447462, + "grad_norm": 0.5501612424850464, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0356, + "step": 15990 + }, + { + "epoch": 0.45746962115796996, + "grad_norm": 1.0399914979934692, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.039, + "step": 16000 + }, + { + "epoch": 0.4577555396711937, + "grad_norm": 0.42251288890838623, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0413, + "step": 16010 + }, + { + "epoch": 0.45804145818441744, + "grad_norm": 0.5694882869720459, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0501, + "step": 16020 + }, + { + "epoch": 0.4583273766976412, + "grad_norm": 0.37367814779281616, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0388, + "step": 16030 + }, + { + "epoch": 0.4586132952108649, + "grad_norm": 0.7947224974632263, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0324, + "step": 16040 + }, + { + "epoch": 0.45889921372408865, + "grad_norm": 0.47871798276901245, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0345, + "step": 16050 + }, + { + "epoch": 0.4591851322373124, + "grad_norm": 1.4443609714508057, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0502, + "step": 16060 + }, + { + "epoch": 0.45947105075053607, + "grad_norm": 0.8326191902160645, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0325, + "step": 16070 + }, + { + "epoch": 0.4597569692637598, + "grad_norm": 0.2887400686740875, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.035, + "step": 16080 + }, + { + "epoch": 0.46004288777698354, + "grad_norm": 0.34353405237197876, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0324, + "step": 16090 + }, + { + "epoch": 0.4603288062902073, + "grad_norm": 0.7319850325584412, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0307, + "step": 16100 + }, + { + "epoch": 0.460614724803431, + "grad_norm": 0.6628556847572327, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0398, + "step": 16110 + }, + { + "epoch": 0.46090064331665476, + "grad_norm": 0.39974722266197205, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.038, + "step": 16120 + }, + { + "epoch": 0.4611865618298785, + "grad_norm": 0.7769339680671692, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0425, + "step": 16130 + }, + { + "epoch": 0.46147248034310223, + "grad_norm": 0.6823691129684448, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.039, + "step": 16140 + }, + { + "epoch": 0.46175839885632597, + "grad_norm": 0.6749460697174072, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0388, + "step": 16150 + }, + { + "epoch": 0.46204431736954965, + "grad_norm": 1.0745635032653809, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0406, + "step": 16160 + }, + { + "epoch": 0.4623302358827734, + "grad_norm": 0.8388734459877014, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0345, + "step": 16170 + }, + { + "epoch": 0.4626161543959971, + "grad_norm": 0.675828218460083, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0355, + "step": 16180 + }, + { + "epoch": 0.46290207290922086, + "grad_norm": 0.9872504472732544, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0374, + "step": 16190 + }, + { + "epoch": 0.4631879914224446, + "grad_norm": 0.4705125689506531, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0416, + "step": 16200 + }, + { + "epoch": 0.46347390993566834, + "grad_norm": 0.43577539920806885, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.041, + "step": 16210 + }, + { + "epoch": 0.4637598284488921, + "grad_norm": 0.6472166180610657, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0372, + "step": 16220 + }, + { + "epoch": 0.4640457469621158, + "grad_norm": 1.0108906030654907, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0464, + "step": 16230 + }, + { + "epoch": 0.46433166547533955, + "grad_norm": 0.6221884489059448, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0396, + "step": 16240 + }, + { + "epoch": 0.46461758398856323, + "grad_norm": 0.7375202178955078, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0365, + "step": 16250 + }, + { + "epoch": 0.46490350250178697, + "grad_norm": 0.5090222358703613, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0404, + "step": 16260 + }, + { + "epoch": 0.4651894210150107, + "grad_norm": 0.5641722679138184, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0424, + "step": 16270 + }, + { + "epoch": 0.46547533952823444, + "grad_norm": 0.3946240246295929, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0433, + "step": 16280 + }, + { + "epoch": 0.4657612580414582, + "grad_norm": 0.525059700012207, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0399, + "step": 16290 + }, + { + "epoch": 0.4660471765546819, + "grad_norm": 0.6106441617012024, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0417, + "step": 16300 + }, + { + "epoch": 0.46633309506790566, + "grad_norm": 0.7064299583435059, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0331, + "step": 16310 + }, + { + "epoch": 0.4666190135811294, + "grad_norm": 0.6251654624938965, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0377, + "step": 16320 + }, + { + "epoch": 0.46690493209435313, + "grad_norm": 0.6626482009887695, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0355, + "step": 16330 + }, + { + "epoch": 0.4671908506075768, + "grad_norm": 0.32827794551849365, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0438, + "step": 16340 + }, + { + "epoch": 0.46747676912080055, + "grad_norm": 1.147644281387329, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.041, + "step": 16350 + }, + { + "epoch": 0.4677626876340243, + "grad_norm": 0.5785626769065857, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0362, + "step": 16360 + }, + { + "epoch": 0.468048606147248, + "grad_norm": 0.7087936401367188, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0364, + "step": 16370 + }, + { + "epoch": 0.46833452466047176, + "grad_norm": 0.7729533314704895, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0357, + "step": 16380 + }, + { + "epoch": 0.4686204431736955, + "grad_norm": 0.9080077409744263, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0445, + "step": 16390 + }, + { + "epoch": 0.46890636168691924, + "grad_norm": 0.5273067355155945, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0395, + "step": 16400 + }, + { + "epoch": 0.469192280200143, + "grad_norm": 0.4801991581916809, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0469, + "step": 16410 + }, + { + "epoch": 0.4694781987133667, + "grad_norm": 0.38060688972473145, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0377, + "step": 16420 + }, + { + "epoch": 0.4697641172265904, + "grad_norm": 1.335648536682129, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0444, + "step": 16430 + }, + { + "epoch": 0.47005003573981413, + "grad_norm": 0.6224690079689026, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0365, + "step": 16440 + }, + { + "epoch": 0.47033595425303787, + "grad_norm": 0.39938899874687195, + "learning_rate": 1.007637577910799e-05, + "loss": 0.037, + "step": 16450 + }, + { + "epoch": 0.4706218727662616, + "grad_norm": 0.47899872064590454, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0371, + "step": 16460 + }, + { + "epoch": 0.47090779127948534, + "grad_norm": 0.8991144895553589, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0337, + "step": 16470 + }, + { + "epoch": 0.4711937097927091, + "grad_norm": 0.6228598356246948, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0388, + "step": 16480 + }, + { + "epoch": 0.4714796283059328, + "grad_norm": 0.41108259558677673, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0378, + "step": 16490 + }, + { + "epoch": 0.47176554681915656, + "grad_norm": 0.722955048084259, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0381, + "step": 16500 + }, + { + "epoch": 0.4720514653323803, + "grad_norm": 0.6090973019599915, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0348, + "step": 16510 + }, + { + "epoch": 0.472337383845604, + "grad_norm": 0.483549565076828, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0456, + "step": 16520 + }, + { + "epoch": 0.4726233023588277, + "grad_norm": 0.4134727418422699, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0444, + "step": 16530 + }, + { + "epoch": 0.47290922087205145, + "grad_norm": 0.4629753530025482, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0382, + "step": 16540 + }, + { + "epoch": 0.4731951393852752, + "grad_norm": 0.8709504008293152, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0384, + "step": 16550 + }, + { + "epoch": 0.4734810578984989, + "grad_norm": 0.683397114276886, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0398, + "step": 16560 + }, + { + "epoch": 0.47376697641172266, + "grad_norm": 0.5743465423583984, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0431, + "step": 16570 + }, + { + "epoch": 0.4740528949249464, + "grad_norm": 1.0080480575561523, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0378, + "step": 16580 + }, + { + "epoch": 0.47433881343817014, + "grad_norm": 0.4668700098991394, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0369, + "step": 16590 + }, + { + "epoch": 0.4746247319513939, + "grad_norm": 0.6005896925926208, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0508, + "step": 16600 + }, + { + "epoch": 0.47491065046461756, + "grad_norm": 0.5788530707359314, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0354, + "step": 16610 + }, + { + "epoch": 0.4751965689778413, + "grad_norm": 0.38784441351890564, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0357, + "step": 16620 + }, + { + "epoch": 0.47548248749106503, + "grad_norm": 0.4809567928314209, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0331, + "step": 16630 + }, + { + "epoch": 0.47576840600428877, + "grad_norm": 0.6647809147834778, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0473, + "step": 16640 + }, + { + "epoch": 0.4760543245175125, + "grad_norm": 0.3968522548675537, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0304, + "step": 16650 + }, + { + "epoch": 0.47634024303073624, + "grad_norm": 0.3258526027202606, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0387, + "step": 16660 + }, + { + "epoch": 0.47662616154396, + "grad_norm": 0.43442079424858093, + "learning_rate": 9.863295834019308e-06, + "loss": 0.04, + "step": 16670 + }, + { + "epoch": 0.4769120800571837, + "grad_norm": 0.36909565329551697, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0351, + "step": 16680 + }, + { + "epoch": 0.47719799857040746, + "grad_norm": 0.5566768050193787, + "learning_rate": 9.843955128197274e-06, + "loss": 0.031, + "step": 16690 + }, + { + "epoch": 0.47748391708363114, + "grad_norm": 0.5705142617225647, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0359, + "step": 16700 + }, + { + "epoch": 0.4777698355968549, + "grad_norm": 0.28931716084480286, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0407, + "step": 16710 + }, + { + "epoch": 0.4780557541100786, + "grad_norm": 0.5509498715400696, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0363, + "step": 16720 + }, + { + "epoch": 0.47834167262330235, + "grad_norm": 0.3564346432685852, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0364, + "step": 16730 + }, + { + "epoch": 0.4786275911365261, + "grad_norm": 0.32734423875808716, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0369, + "step": 16740 + }, + { + "epoch": 0.4789135096497498, + "grad_norm": 0.3048594892024994, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0367, + "step": 16750 + }, + { + "epoch": 0.47919942816297356, + "grad_norm": 0.9007049798965454, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0377, + "step": 16760 + }, + { + "epoch": 0.4794853466761973, + "grad_norm": 0.7010983824729919, + "learning_rate": 9.76664747972605e-06, + "loss": 0.039, + "step": 16770 + }, + { + "epoch": 0.47977126518942104, + "grad_norm": 0.644473135471344, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0466, + "step": 16780 + }, + { + "epoch": 0.4800571837026447, + "grad_norm": 0.6333492398262024, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0373, + "step": 16790 + }, + { + "epoch": 0.48034310221586846, + "grad_norm": 0.5148355960845947, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0392, + "step": 16800 + }, + { + "epoch": 0.4806290207290922, + "grad_norm": 0.7288355231285095, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0381, + "step": 16810 + }, + { + "epoch": 0.48091493924231593, + "grad_norm": 0.3674873113632202, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0418, + "step": 16820 + }, + { + "epoch": 0.48120085775553967, + "grad_norm": 0.5055420398712158, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0336, + "step": 16830 + }, + { + "epoch": 0.4814867762687634, + "grad_norm": 0.641754686832428, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0342, + "step": 16840 + }, + { + "epoch": 0.48177269478198714, + "grad_norm": 0.308200478553772, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0364, + "step": 16850 + }, + { + "epoch": 0.4820586132952109, + "grad_norm": 0.41361021995544434, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0342, + "step": 16860 + }, + { + "epoch": 0.4823445318084346, + "grad_norm": 0.45777833461761475, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0353, + "step": 16870 + }, + { + "epoch": 0.4826304503216583, + "grad_norm": 0.7587664723396301, + "learning_rate": 9.660501900166734e-06, + "loss": 0.043, + "step": 16880 + }, + { + "epoch": 0.48291636883488204, + "grad_norm": 0.8740283250808716, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0372, + "step": 16890 + }, + { + "epoch": 0.4832022873481058, + "grad_norm": 0.3009270429611206, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0373, + "step": 16900 + }, + { + "epoch": 0.4834882058613295, + "grad_norm": 0.4439285695552826, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0349, + "step": 16910 + }, + { + "epoch": 0.48377412437455325, + "grad_norm": 0.39849671721458435, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0394, + "step": 16920 + }, + { + "epoch": 0.484060042887777, + "grad_norm": 0.6423043608665466, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0413, + "step": 16930 + }, + { + "epoch": 0.4843459614010007, + "grad_norm": 0.3683928847312927, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0387, + "step": 16940 + }, + { + "epoch": 0.48463187991422446, + "grad_norm": 0.7087769508361816, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0397, + "step": 16950 + }, + { + "epoch": 0.4849177984274482, + "grad_norm": 0.5348120927810669, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0405, + "step": 16960 + }, + { + "epoch": 0.4852037169406719, + "grad_norm": 0.549891471862793, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0363, + "step": 16970 + }, + { + "epoch": 0.4854896354538956, + "grad_norm": 0.7177272439002991, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0343, + "step": 16980 + }, + { + "epoch": 0.48577555396711936, + "grad_norm": 0.595417320728302, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0439, + "step": 16990 + }, + { + "epoch": 0.4860614724803431, + "grad_norm": 0.4838889241218567, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0387, + "step": 17000 + }, + { + "epoch": 0.48634739099356683, + "grad_norm": 0.6186223030090332, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0362, + "step": 17010 + }, + { + "epoch": 0.48663330950679057, + "grad_norm": 0.43383121490478516, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0381, + "step": 17020 + }, + { + "epoch": 0.4869192280200143, + "grad_norm": 0.6735527515411377, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0388, + "step": 17030 + }, + { + "epoch": 0.48720514653323804, + "grad_norm": 0.3746320605278015, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0491, + "step": 17040 + }, + { + "epoch": 0.4874910650464618, + "grad_norm": 0.29500988125801086, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0395, + "step": 17050 + }, + { + "epoch": 0.48777698355968546, + "grad_norm": 0.8518465757369995, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0435, + "step": 17060 + }, + { + "epoch": 0.4880629020729092, + "grad_norm": 0.9653190970420837, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0393, + "step": 17070 + }, + { + "epoch": 0.48834882058613294, + "grad_norm": 0.785724937915802, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0372, + "step": 17080 + }, + { + "epoch": 0.4886347390993567, + "grad_norm": 0.9450638890266418, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0406, + "step": 17090 + }, + { + "epoch": 0.4889206576125804, + "grad_norm": 0.645124077796936, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0361, + "step": 17100 + }, + { + "epoch": 0.48920657612580415, + "grad_norm": 0.3352372944355011, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0417, + "step": 17110 + }, + { + "epoch": 0.4894924946390279, + "grad_norm": 0.3858814835548401, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0345, + "step": 17120 + }, + { + "epoch": 0.4897784131522516, + "grad_norm": 0.5403604507446289, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0326, + "step": 17130 + }, + { + "epoch": 0.49006433166547536, + "grad_norm": 0.6986777782440186, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0417, + "step": 17140 + }, + { + "epoch": 0.49035025017869904, + "grad_norm": 0.5456675887107849, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0473, + "step": 17150 + }, + { + "epoch": 0.4906361686919228, + "grad_norm": 0.3961554765701294, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0341, + "step": 17160 + }, + { + "epoch": 0.4909220872051465, + "grad_norm": 0.5188277363777161, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0369, + "step": 17170 + }, + { + "epoch": 0.49120800571837026, + "grad_norm": 0.6042230725288391, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0352, + "step": 17180 + }, + { + "epoch": 0.491493924231594, + "grad_norm": 0.5485941171646118, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0405, + "step": 17190 + }, + { + "epoch": 0.49177984274481773, + "grad_norm": 0.5856509804725647, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0402, + "step": 17200 + }, + { + "epoch": 0.49206576125804147, + "grad_norm": 0.8656556010246277, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0349, + "step": 17210 + }, + { + "epoch": 0.4923516797712652, + "grad_norm": 0.4041757583618164, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0364, + "step": 17220 + }, + { + "epoch": 0.49263759828448894, + "grad_norm": 0.6135975122451782, + "learning_rate": 9.324104146177972e-06, + "loss": 0.036, + "step": 17230 + }, + { + "epoch": 0.4929235167977126, + "grad_norm": 0.5101860165596008, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0359, + "step": 17240 + }, + { + "epoch": 0.49320943531093636, + "grad_norm": 0.9913426041603088, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0552, + "step": 17250 + }, + { + "epoch": 0.4934953538241601, + "grad_norm": 0.6148158311843872, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0388, + "step": 17260 + }, + { + "epoch": 0.49378127233738384, + "grad_norm": 0.6651721596717834, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0374, + "step": 17270 + }, + { + "epoch": 0.4940671908506076, + "grad_norm": 0.9545061588287354, + "learning_rate": 9.276232738281744e-06, + "loss": 0.035, + "step": 17280 + }, + { + "epoch": 0.4943531093638313, + "grad_norm": 0.8923225402832031, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0366, + "step": 17290 + }, + { + "epoch": 0.49463902787705505, + "grad_norm": 0.5337848663330078, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0354, + "step": 17300 + }, + { + "epoch": 0.4949249463902788, + "grad_norm": 0.35039281845092773, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0341, + "step": 17310 + }, + { + "epoch": 0.4952108649035025, + "grad_norm": 0.47406911849975586, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0393, + "step": 17320 + }, + { + "epoch": 0.4954967834167262, + "grad_norm": 0.6226631999015808, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0375, + "step": 17330 + }, + { + "epoch": 0.49578270192994994, + "grad_norm": 0.6652712821960449, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0363, + "step": 17340 + }, + { + "epoch": 0.4960686204431737, + "grad_norm": 1.0042835474014282, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0368, + "step": 17350 + }, + { + "epoch": 0.4963545389563974, + "grad_norm": 0.4334045648574829, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0375, + "step": 17360 + }, + { + "epoch": 0.49664045746962115, + "grad_norm": 0.3561633229255676, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0347, + "step": 17370 + }, + { + "epoch": 0.4969263759828449, + "grad_norm": 0.5763550996780396, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0344, + "step": 17380 + }, + { + "epoch": 0.49721229449606863, + "grad_norm": 0.6306643486022949, + "learning_rate": 9.171095634265995e-06, + "loss": 0.037, + "step": 17390 + }, + { + "epoch": 0.49749821300929237, + "grad_norm": 0.4286569058895111, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0308, + "step": 17400 + }, + { + "epoch": 0.4977841315225161, + "grad_norm": 0.577983558177948, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0393, + "step": 17410 + }, + { + "epoch": 0.4980700500357398, + "grad_norm": 0.5714932084083557, + "learning_rate": 9.142466323573853e-06, + "loss": 0.038, + "step": 17420 + }, + { + "epoch": 0.4983559685489635, + "grad_norm": 0.7529498338699341, + "learning_rate": 9.132927564918328e-06, + "loss": 0.033, + "step": 17430 + }, + { + "epoch": 0.49864188706218726, + "grad_norm": 0.5179672241210938, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0367, + "step": 17440 + }, + { + "epoch": 0.498927805575411, + "grad_norm": 0.38424569368362427, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0401, + "step": 17450 + }, + { + "epoch": 0.49921372408863474, + "grad_norm": 0.469460129737854, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0379, + "step": 17460 + }, + { + "epoch": 0.4994996426018585, + "grad_norm": 0.3285387456417084, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0399, + "step": 17470 + }, + { + "epoch": 0.4997855611150822, + "grad_norm": 0.49863550066947937, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0313, + "step": 17480 + }, + { + "epoch": 0.5000714796283059, + "grad_norm": 0.3926186263561249, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0454, + "step": 17490 + }, + { + "epoch": 0.5003573981415297, + "grad_norm": 0.4476146399974823, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0472, + "step": 17500 + }, + { + "epoch": 0.5006433166547534, + "grad_norm": 0.5645599961280823, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0358, + "step": 17510 + }, + { + "epoch": 0.5009292351679772, + "grad_norm": 0.4813307225704193, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0364, + "step": 17520 + }, + { + "epoch": 0.5012151536812008, + "grad_norm": 0.49410971999168396, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0303, + "step": 17530 + }, + { + "epoch": 0.5015010721944246, + "grad_norm": 0.7172105312347412, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0404, + "step": 17540 + }, + { + "epoch": 0.5017869907076483, + "grad_norm": 0.43401873111724854, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0402, + "step": 17550 + }, + { + "epoch": 0.502072909220872, + "grad_norm": 0.6497406363487244, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0364, + "step": 17560 + }, + { + "epoch": 0.5023588277340958, + "grad_norm": 0.44618356227874756, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0337, + "step": 17570 + }, + { + "epoch": 0.5026447462473195, + "grad_norm": 0.4186992049217224, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0381, + "step": 17580 + }, + { + "epoch": 0.5029306647605433, + "grad_norm": 0.7387974858283997, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0319, + "step": 17590 + }, + { + "epoch": 0.503216583273767, + "grad_norm": 0.8068642020225525, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0373, + "step": 17600 + }, + { + "epoch": 0.5035025017869907, + "grad_norm": 0.5773473978042603, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0372, + "step": 17610 + }, + { + "epoch": 0.5037884203002144, + "grad_norm": 0.32488778233528137, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0334, + "step": 17620 + }, + { + "epoch": 0.5040743388134382, + "grad_norm": 0.33978500962257385, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0493, + "step": 17630 + }, + { + "epoch": 0.5043602573266619, + "grad_norm": 0.5897071361541748, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0335, + "step": 17640 + }, + { + "epoch": 0.5046461758398856, + "grad_norm": 0.6275895833969116, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0395, + "step": 17650 + }, + { + "epoch": 0.5049320943531094, + "grad_norm": 0.7995536923408508, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0422, + "step": 17660 + }, + { + "epoch": 0.505218012866333, + "grad_norm": 0.8734716773033142, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0414, + "step": 17670 + }, + { + "epoch": 0.5055039313795568, + "grad_norm": 0.6239343881607056, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0333, + "step": 17680 + }, + { + "epoch": 0.5057898498927805, + "grad_norm": 0.42508623003959656, + "learning_rate": 8.885721609997551e-06, + "loss": 0.045, + "step": 17690 + }, + { + "epoch": 0.5060757684060043, + "grad_norm": 0.4272485673427582, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0506, + "step": 17700 + }, + { + "epoch": 0.506361686919228, + "grad_norm": 0.8006368279457092, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0431, + "step": 17710 + }, + { + "epoch": 0.5066476054324518, + "grad_norm": 0.5896835327148438, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0322, + "step": 17720 + }, + { + "epoch": 0.5069335239456755, + "grad_norm": 0.6880389451980591, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0322, + "step": 17730 + }, + { + "epoch": 0.5072194424588992, + "grad_norm": 1.4850202798843384, + "learning_rate": 8.83836825410936e-06, + "loss": 0.052, + "step": 17740 + }, + { + "epoch": 0.507505360972123, + "grad_norm": 0.7684240937232971, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0353, + "step": 17750 + }, + { + "epoch": 0.5077912794853466, + "grad_norm": 0.5456307530403137, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0419, + "step": 17760 + }, + { + "epoch": 0.5080771979985704, + "grad_norm": 0.5775120258331299, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0366, + "step": 17770 + }, + { + "epoch": 0.5083631165117941, + "grad_norm": 0.6453070044517517, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0341, + "step": 17780 + }, + { + "epoch": 0.5086490350250179, + "grad_norm": 0.7906973361968994, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0405, + "step": 17790 + }, + { + "epoch": 0.5089349535382416, + "grad_norm": 1.0740606784820557, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0344, + "step": 17800 + }, + { + "epoch": 0.5092208720514654, + "grad_norm": 0.41854357719421387, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0334, + "step": 17810 + }, + { + "epoch": 0.5095067905646891, + "grad_norm": 0.6328964233398438, + "learning_rate": 8.762735374981932e-06, + "loss": 0.036, + "step": 17820 + }, + { + "epoch": 0.5097927090779127, + "grad_norm": 0.40875789523124695, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0338, + "step": 17830 + }, + { + "epoch": 0.5100786275911365, + "grad_norm": 0.5056312084197998, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0332, + "step": 17840 + }, + { + "epoch": 0.5103645461043602, + "grad_norm": 0.5005037784576416, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0416, + "step": 17850 + }, + { + "epoch": 0.510650464617584, + "grad_norm": 0.5689167380332947, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0329, + "step": 17860 + }, + { + "epoch": 0.5109363831308077, + "grad_norm": 0.5222717523574829, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0336, + "step": 17870 + }, + { + "epoch": 0.5112223016440315, + "grad_norm": 0.5998329520225525, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0354, + "step": 17880 + }, + { + "epoch": 0.5115082201572552, + "grad_norm": 0.4684480130672455, + "learning_rate": 8.69669425266315e-06, + "loss": 0.05, + "step": 17890 + }, + { + "epoch": 0.511794138670479, + "grad_norm": 0.4061124622821808, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0384, + "step": 17900 + }, + { + "epoch": 0.5120800571837026, + "grad_norm": 0.5025928020477295, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0386, + "step": 17910 + }, + { + "epoch": 0.5123659756969263, + "grad_norm": 0.3731222152709961, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0378, + "step": 17920 + }, + { + "epoch": 0.5126518942101501, + "grad_norm": 0.7784973978996277, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0419, + "step": 17930 + }, + { + "epoch": 0.5129378127233738, + "grad_norm": 0.7074074745178223, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0386, + "step": 17940 + }, + { + "epoch": 0.5132237312365976, + "grad_norm": 0.49802306294441223, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0418, + "step": 17950 + }, + { + "epoch": 0.5135096497498213, + "grad_norm": 0.4355427920818329, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0431, + "step": 17960 + }, + { + "epoch": 0.5137955682630451, + "grad_norm": 0.672635555267334, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0403, + "step": 17970 + }, + { + "epoch": 0.5140814867762687, + "grad_norm": 0.6733908653259277, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0487, + "step": 17980 + }, + { + "epoch": 0.5143674052894925, + "grad_norm": 0.43711504340171814, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0378, + "step": 17990 + }, + { + "epoch": 0.5146533238027162, + "grad_norm": 0.6371222138404846, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0336, + "step": 18000 + }, + { + "epoch": 0.5149392423159399, + "grad_norm": 0.8007041811943054, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0371, + "step": 18010 + }, + { + "epoch": 0.5152251608291637, + "grad_norm": 0.4725078344345093, + "learning_rate": 8.574400723012433e-06, + "loss": 0.037, + "step": 18020 + }, + { + "epoch": 0.5155110793423874, + "grad_norm": 0.34229791164398193, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0353, + "step": 18030 + }, + { + "epoch": 0.5157969978556112, + "grad_norm": 0.27863454818725586, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0371, + "step": 18040 + }, + { + "epoch": 0.5160829163688349, + "grad_norm": 0.43021920323371887, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0419, + "step": 18050 + }, + { + "epoch": 0.5163688348820586, + "grad_norm": 0.4683758318424225, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0307, + "step": 18060 + }, + { + "epoch": 0.5166547533952823, + "grad_norm": 0.29085367918014526, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0372, + "step": 18070 + }, + { + "epoch": 0.5169406719085061, + "grad_norm": 0.4396727681159973, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0328, + "step": 18080 + }, + { + "epoch": 0.5172265904217298, + "grad_norm": 0.539021372795105, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0317, + "step": 18090 + }, + { + "epoch": 0.5175125089349535, + "grad_norm": 0.556974470615387, + "learning_rate": 8.499380733111628e-06, + "loss": 0.037, + "step": 18100 + }, + { + "epoch": 0.5177984274481773, + "grad_norm": 0.4445747137069702, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0344, + "step": 18110 + }, + { + "epoch": 0.518084345961401, + "grad_norm": 0.3742713928222656, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0339, + "step": 18120 + }, + { + "epoch": 0.5183702644746248, + "grad_norm": 0.8467416167259216, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0409, + "step": 18130 + }, + { + "epoch": 0.5186561829878484, + "grad_norm": 0.7731484770774841, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0379, + "step": 18140 + }, + { + "epoch": 0.5189421015010722, + "grad_norm": 0.5664084553718567, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0353, + "step": 18150 + }, + { + "epoch": 0.5192280200142959, + "grad_norm": 0.5623966455459595, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0412, + "step": 18160 + }, + { + "epoch": 0.5195139385275197, + "grad_norm": 0.5074556469917297, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0402, + "step": 18170 + }, + { + "epoch": 0.5197998570407434, + "grad_norm": 0.49439728260040283, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0341, + "step": 18180 + }, + { + "epoch": 0.5200857755539671, + "grad_norm": 0.5982527136802673, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0308, + "step": 18190 + }, + { + "epoch": 0.5203716940671909, + "grad_norm": 0.7891598343849182, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0437, + "step": 18200 + }, + { + "epoch": 0.5206576125804145, + "grad_norm": 0.7565666437149048, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0381, + "step": 18210 + }, + { + "epoch": 0.5209435310936383, + "grad_norm": 0.33346351981163025, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0454, + "step": 18220 + }, + { + "epoch": 0.521229449606862, + "grad_norm": 0.5885659456253052, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0413, + "step": 18230 + }, + { + "epoch": 0.5215153681200858, + "grad_norm": 0.6487091183662415, + "learning_rate": 8.368551060444755e-06, + "loss": 0.035, + "step": 18240 + }, + { + "epoch": 0.5218012866333095, + "grad_norm": 0.9817430377006531, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0394, + "step": 18250 + }, + { + "epoch": 0.5220872051465333, + "grad_norm": 0.5691193342208862, + "learning_rate": 8.349909816537207e-06, + "loss": 0.041, + "step": 18260 + }, + { + "epoch": 0.522373123659757, + "grad_norm": 0.5326661467552185, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0361, + "step": 18270 + }, + { + "epoch": 0.5226590421729806, + "grad_norm": 0.5536142587661743, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0406, + "step": 18280 + }, + { + "epoch": 0.5229449606862044, + "grad_norm": 0.3482394218444824, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0423, + "step": 18290 + }, + { + "epoch": 0.5232308791994281, + "grad_norm": 0.514914333820343, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0352, + "step": 18300 + }, + { + "epoch": 0.5235167977126519, + "grad_norm": 0.7681404948234558, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0386, + "step": 18310 + }, + { + "epoch": 0.5238027162258756, + "grad_norm": 0.400426983833313, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0333, + "step": 18320 + }, + { + "epoch": 0.5240886347390994, + "grad_norm": 0.4996081590652466, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0381, + "step": 18330 + }, + { + "epoch": 0.5243745532523231, + "grad_norm": 0.5379085540771484, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0348, + "step": 18340 + }, + { + "epoch": 0.5246604717655469, + "grad_norm": 0.4462053179740906, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0307, + "step": 18350 + }, + { + "epoch": 0.5249463902787705, + "grad_norm": 0.7336096167564392, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0345, + "step": 18360 + }, + { + "epoch": 0.5252323087919942, + "grad_norm": 0.6676360368728638, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0346, + "step": 18370 + }, + { + "epoch": 0.525518227305218, + "grad_norm": 0.46608656644821167, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0334, + "step": 18380 + }, + { + "epoch": 0.5258041458184417, + "grad_norm": 0.4906940460205078, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0331, + "step": 18390 + }, + { + "epoch": 0.5260900643316655, + "grad_norm": 0.4200032353401184, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0394, + "step": 18400 + }, + { + "epoch": 0.5263759828448892, + "grad_norm": 0.5663877725601196, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0349, + "step": 18410 + }, + { + "epoch": 0.526661901358113, + "grad_norm": 0.36824384331703186, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0303, + "step": 18420 + }, + { + "epoch": 0.5269478198713367, + "grad_norm": 0.8120076060295105, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0443, + "step": 18430 + }, + { + "epoch": 0.5272337383845604, + "grad_norm": 0.4102472960948944, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0369, + "step": 18440 + }, + { + "epoch": 0.5275196568977841, + "grad_norm": 0.5186526775360107, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0338, + "step": 18450 + }, + { + "epoch": 0.5278055754110078, + "grad_norm": 0.9650108218193054, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0343, + "step": 18460 + }, + { + "epoch": 0.5280914939242316, + "grad_norm": 0.5894375443458557, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0416, + "step": 18470 + }, + { + "epoch": 0.5283774124374553, + "grad_norm": 0.6188816428184509, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0402, + "step": 18480 + }, + { + "epoch": 0.5286633309506791, + "grad_norm": 0.35280847549438477, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0363, + "step": 18490 + }, + { + "epoch": 0.5289492494639028, + "grad_norm": 0.7289313673973083, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0392, + "step": 18500 + }, + { + "epoch": 0.5292351679771266, + "grad_norm": 0.505050778388977, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0329, + "step": 18510 + }, + { + "epoch": 0.5295210864903502, + "grad_norm": 0.7029705047607422, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0344, + "step": 18520 + }, + { + "epoch": 0.529807005003574, + "grad_norm": 0.2958471477031708, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0431, + "step": 18530 + }, + { + "epoch": 0.5300929235167977, + "grad_norm": 0.9649683237075806, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0329, + "step": 18540 + }, + { + "epoch": 0.5303788420300214, + "grad_norm": 0.24733735620975494, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0354, + "step": 18550 + }, + { + "epoch": 0.5306647605432452, + "grad_norm": 0.44838136434555054, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0334, + "step": 18560 + }, + { + "epoch": 0.5309506790564689, + "grad_norm": 0.4505597949028015, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0338, + "step": 18570 + }, + { + "epoch": 0.5312365975696927, + "grad_norm": 0.44188442826271057, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0326, + "step": 18580 + }, + { + "epoch": 0.5315225160829163, + "grad_norm": 0.4539152979850769, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0369, + "step": 18590 + }, + { + "epoch": 0.5318084345961401, + "grad_norm": 0.8311023712158203, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0441, + "step": 18600 + }, + { + "epoch": 0.5320943531093638, + "grad_norm": 0.53764808177948, + "learning_rate": 8.025779439806006e-06, + "loss": 0.037, + "step": 18610 + }, + { + "epoch": 0.5323802716225876, + "grad_norm": 1.2192102670669556, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0369, + "step": 18620 + }, + { + "epoch": 0.5326661901358113, + "grad_norm": 0.5254611968994141, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0371, + "step": 18630 + }, + { + "epoch": 0.532952108649035, + "grad_norm": 0.585709810256958, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0337, + "step": 18640 + }, + { + "epoch": 0.5332380271622588, + "grad_norm": 0.45416259765625, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0425, + "step": 18650 + }, + { + "epoch": 0.5335239456754824, + "grad_norm": 0.3957739472389221, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0354, + "step": 18660 + }, + { + "epoch": 0.5338098641887062, + "grad_norm": 0.6211117506027222, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0347, + "step": 18670 + }, + { + "epoch": 0.5340957827019299, + "grad_norm": 0.49023327231407166, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0308, + "step": 18680 + }, + { + "epoch": 0.5343817012151537, + "grad_norm": 0.5823351144790649, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0351, + "step": 18690 + }, + { + "epoch": 0.5346676197283774, + "grad_norm": 0.6048677563667297, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0382, + "step": 18700 + }, + { + "epoch": 0.5349535382416012, + "grad_norm": 0.5293828845024109, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0329, + "step": 18710 + }, + { + "epoch": 0.5352394567548249, + "grad_norm": 0.5935509204864502, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0388, + "step": 18720 + }, + { + "epoch": 0.5355253752680486, + "grad_norm": 0.8369598388671875, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0421, + "step": 18730 + }, + { + "epoch": 0.5358112937812723, + "grad_norm": 0.6874870657920837, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0385, + "step": 18740 + }, + { + "epoch": 0.536097212294496, + "grad_norm": 0.43511492013931274, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0405, + "step": 18750 + }, + { + "epoch": 0.5363831308077198, + "grad_norm": 0.662755012512207, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0375, + "step": 18760 + }, + { + "epoch": 0.5366690493209435, + "grad_norm": 0.5519852638244629, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0351, + "step": 18770 + }, + { + "epoch": 0.5369549678341673, + "grad_norm": 0.9711637496948242, + "learning_rate": 7.869858673101027e-06, + "loss": 0.038, + "step": 18780 + }, + { + "epoch": 0.537240886347391, + "grad_norm": 0.4944411516189575, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0416, + "step": 18790 + }, + { + "epoch": 0.5375268048606148, + "grad_norm": 0.5257377624511719, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0349, + "step": 18800 + }, + { + "epoch": 0.5378127233738385, + "grad_norm": 0.4833063781261444, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0414, + "step": 18810 + }, + { + "epoch": 0.5380986418870621, + "grad_norm": 0.4496164917945862, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0369, + "step": 18820 + }, + { + "epoch": 0.5383845604002859, + "grad_norm": 0.6939138174057007, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0337, + "step": 18830 + }, + { + "epoch": 0.5386704789135096, + "grad_norm": 0.32579538226127625, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0371, + "step": 18840 + }, + { + "epoch": 0.5389563974267334, + "grad_norm": 0.35594654083251953, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0366, + "step": 18850 + }, + { + "epoch": 0.5392423159399571, + "grad_norm": 0.6114012002944946, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0372, + "step": 18860 + }, + { + "epoch": 0.5395282344531809, + "grad_norm": 0.8492457270622253, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0346, + "step": 18870 + }, + { + "epoch": 0.5398141529664046, + "grad_norm": 0.5214036703109741, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0388, + "step": 18880 + }, + { + "epoch": 0.5401000714796284, + "grad_norm": 0.428671658039093, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0394, + "step": 18890 + }, + { + "epoch": 0.540385989992852, + "grad_norm": 0.6071562767028809, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0371, + "step": 18900 + }, + { + "epoch": 0.5406719085060757, + "grad_norm": 0.41996505856513977, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0334, + "step": 18910 + }, + { + "epoch": 0.5409578270192995, + "grad_norm": 0.5260844826698303, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0362, + "step": 18920 + }, + { + "epoch": 0.5412437455325232, + "grad_norm": 0.43362122774124146, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0325, + "step": 18930 + }, + { + "epoch": 0.541529664045747, + "grad_norm": 0.4597149193286896, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0345, + "step": 18940 + }, + { + "epoch": 0.5418155825589707, + "grad_norm": 0.6667322516441345, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0421, + "step": 18950 + }, + { + "epoch": 0.5421015010721945, + "grad_norm": 0.8998900651931763, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0368, + "step": 18960 + }, + { + "epoch": 0.5423874195854181, + "grad_norm": 0.5075538158416748, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0327, + "step": 18970 + }, + { + "epoch": 0.5426733380986419, + "grad_norm": 0.38445526361465454, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0446, + "step": 18980 + }, + { + "epoch": 0.5429592566118656, + "grad_norm": 0.696186363697052, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0364, + "step": 18990 + }, + { + "epoch": 0.5432451751250893, + "grad_norm": 0.6371187567710876, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0401, + "step": 19000 + }, + { + "epoch": 0.5435310936383131, + "grad_norm": 0.6122881174087524, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0345, + "step": 19010 + }, + { + "epoch": 0.5438170121515368, + "grad_norm": 0.4222267270088196, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0456, + "step": 19020 + }, + { + "epoch": 0.5441029306647606, + "grad_norm": 0.6122517585754395, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0434, + "step": 19030 + }, + { + "epoch": 0.5443888491779842, + "grad_norm": 0.2783992886543274, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0354, + "step": 19040 + }, + { + "epoch": 0.544674767691208, + "grad_norm": 0.6433000564575195, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0321, + "step": 19050 + }, + { + "epoch": 0.5449606862044317, + "grad_norm": 0.6967030167579651, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0394, + "step": 19060 + }, + { + "epoch": 0.5452466047176555, + "grad_norm": 0.4799044132232666, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0329, + "step": 19070 + }, + { + "epoch": 0.5455325232308792, + "grad_norm": 0.633895993232727, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0316, + "step": 19080 + }, + { + "epoch": 0.5458184417441029, + "grad_norm": 0.5601945519447327, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0449, + "step": 19090 + }, + { + "epoch": 0.5461043602573267, + "grad_norm": 0.4917007088661194, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0351, + "step": 19100 + }, + { + "epoch": 0.5463902787705504, + "grad_norm": 0.4813363254070282, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.029, + "step": 19110 + }, + { + "epoch": 0.5466761972837741, + "grad_norm": 0.5359676480293274, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0346, + "step": 19120 + }, + { + "epoch": 0.5469621157969978, + "grad_norm": 0.6500958204269409, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0374, + "step": 19130 + }, + { + "epoch": 0.5472480343102216, + "grad_norm": 0.7708510756492615, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0332, + "step": 19140 + }, + { + "epoch": 0.5475339528234453, + "grad_norm": 0.45693230628967285, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0344, + "step": 19150 + }, + { + "epoch": 0.5478198713366691, + "grad_norm": 0.6046226620674133, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0342, + "step": 19160 + }, + { + "epoch": 0.5481057898498928, + "grad_norm": 0.5253175497055054, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0449, + "step": 19170 + }, + { + "epoch": 0.5483917083631165, + "grad_norm": 0.3790060877799988, + "learning_rate": 7.507267205473318e-06, + "loss": 0.037, + "step": 19180 + }, + { + "epoch": 0.5486776268763403, + "grad_norm": 0.37709203362464905, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0346, + "step": 19190 + }, + { + "epoch": 0.5489635453895639, + "grad_norm": 0.3940931558609009, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0427, + "step": 19200 + }, + { + "epoch": 0.5492494639027877, + "grad_norm": 0.761299192905426, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0353, + "step": 19210 + }, + { + "epoch": 0.5495353824160114, + "grad_norm": 0.5268495082855225, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0328, + "step": 19220 + }, + { + "epoch": 0.5498213009292352, + "grad_norm": 0.45624151825904846, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0353, + "step": 19230 + }, + { + "epoch": 0.5501072194424589, + "grad_norm": 0.5374972224235535, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0345, + "step": 19240 + }, + { + "epoch": 0.5503931379556827, + "grad_norm": 0.49830907583236694, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0328, + "step": 19250 + }, + { + "epoch": 0.5506790564689064, + "grad_norm": 0.6223296523094177, + "learning_rate": 7.435514206212475e-06, + "loss": 0.037, + "step": 19260 + }, + { + "epoch": 0.55096497498213, + "grad_norm": 0.42801398038864136, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0371, + "step": 19270 + }, + { + "epoch": 0.5512508934953538, + "grad_norm": 0.3872825801372528, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0314, + "step": 19280 + }, + { + "epoch": 0.5515368120085775, + "grad_norm": 0.3967494070529938, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0349, + "step": 19290 + }, + { + "epoch": 0.5518227305218013, + "grad_norm": 0.42383769154548645, + "learning_rate": 7.399737764864619e-06, + "loss": 0.045, + "step": 19300 + }, + { + "epoch": 0.552108649035025, + "grad_norm": 0.48501884937286377, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0373, + "step": 19310 + }, + { + "epoch": 0.5523945675482488, + "grad_norm": 0.3783693015575409, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0334, + "step": 19320 + }, + { + "epoch": 0.5526804860614725, + "grad_norm": 0.5733019709587097, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0369, + "step": 19330 + }, + { + "epoch": 0.5529664045746963, + "grad_norm": 0.5022825002670288, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0375, + "step": 19340 + }, + { + "epoch": 0.5532523230879199, + "grad_norm": 0.5508015155792236, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0415, + "step": 19350 + }, + { + "epoch": 0.5535382416011436, + "grad_norm": 0.5692425966262817, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0401, + "step": 19360 + }, + { + "epoch": 0.5538241601143674, + "grad_norm": 0.7247840762138367, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0373, + "step": 19370 + }, + { + "epoch": 0.5541100786275911, + "grad_norm": 0.633986234664917, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0435, + "step": 19380 + }, + { + "epoch": 0.5543959971408149, + "grad_norm": 0.8598711490631104, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0424, + "step": 19390 + }, + { + "epoch": 0.5546819156540386, + "grad_norm": 0.782328188419342, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0424, + "step": 19400 + }, + { + "epoch": 0.5549678341672624, + "grad_norm": 0.48890456557273865, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0351, + "step": 19410 + }, + { + "epoch": 0.555253752680486, + "grad_norm": 0.4759981036186218, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0395, + "step": 19420 + }, + { + "epoch": 0.5555396711937098, + "grad_norm": 0.6431323885917664, + "learning_rate": 7.283934675167239e-06, + "loss": 0.036, + "step": 19430 + }, + { + "epoch": 0.5558255897069335, + "grad_norm": 0.6633809208869934, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0333, + "step": 19440 + }, + { + "epoch": 0.5561115082201572, + "grad_norm": 0.3405994772911072, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0375, + "step": 19450 + }, + { + "epoch": 0.556397426733381, + "grad_norm": 0.3443987965583801, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0329, + "step": 19460 + }, + { + "epoch": 0.5566833452466047, + "grad_norm": 0.7973398566246033, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0412, + "step": 19470 + }, + { + "epoch": 0.5569692637598285, + "grad_norm": 0.43843239545822144, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0302, + "step": 19480 + }, + { + "epoch": 0.5572551822730522, + "grad_norm": 0.6797782182693481, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0401, + "step": 19490 + }, + { + "epoch": 0.557541100786276, + "grad_norm": 0.5020610690116882, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0329, + "step": 19500 + }, + { + "epoch": 0.5578270192994996, + "grad_norm": 0.5093050003051758, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0381, + "step": 19510 + }, + { + "epoch": 0.5581129378127234, + "grad_norm": 0.6136947870254517, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0373, + "step": 19520 + }, + { + "epoch": 0.5583988563259471, + "grad_norm": 0.4213317930698395, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0338, + "step": 19530 + }, + { + "epoch": 0.5586847748391708, + "grad_norm": 0.6560636162757874, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0332, + "step": 19540 + }, + { + "epoch": 0.5589706933523946, + "grad_norm": 0.41303765773773193, + "learning_rate": 7.177693135871202e-06, + "loss": 0.03, + "step": 19550 + }, + { + "epoch": 0.5592566118656183, + "grad_norm": 0.5260538458824158, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0328, + "step": 19560 + }, + { + "epoch": 0.559542530378842, + "grad_norm": 0.6076327562332153, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0428, + "step": 19570 + }, + { + "epoch": 0.5598284488920657, + "grad_norm": 0.635111927986145, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0318, + "step": 19580 + }, + { + "epoch": 0.5601143674052895, + "grad_norm": 0.7933056354522705, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0357, + "step": 19590 + }, + { + "epoch": 0.5604002859185132, + "grad_norm": 0.44312241673469543, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0289, + "step": 19600 + }, + { + "epoch": 0.560686204431737, + "grad_norm": 0.36346134543418884, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0354, + "step": 19610 + }, + { + "epoch": 0.5609721229449607, + "grad_norm": 0.49605289101600647, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0367, + "step": 19620 + }, + { + "epoch": 0.5612580414581844, + "grad_norm": 0.7115452289581299, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0354, + "step": 19630 + }, + { + "epoch": 0.5615439599714082, + "grad_norm": 0.650925874710083, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0353, + "step": 19640 + }, + { + "epoch": 0.5618298784846318, + "grad_norm": 0.5046663880348206, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0294, + "step": 19650 + }, + { + "epoch": 0.5621157969978556, + "grad_norm": 0.4441855549812317, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0326, + "step": 19660 + }, + { + "epoch": 0.5624017155110793, + "grad_norm": 0.3956650495529175, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0446, + "step": 19670 + }, + { + "epoch": 0.5626876340243031, + "grad_norm": 0.5384211540222168, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0331, + "step": 19680 + }, + { + "epoch": 0.5629735525375268, + "grad_norm": 0.6183366775512695, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0324, + "step": 19690 + }, + { + "epoch": 0.5632594710507506, + "grad_norm": 0.9116242527961731, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0341, + "step": 19700 + }, + { + "epoch": 0.5635453895639743, + "grad_norm": 0.8171015381813049, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0306, + "step": 19710 + }, + { + "epoch": 0.563831308077198, + "grad_norm": 0.42670243978500366, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0336, + "step": 19720 + }, + { + "epoch": 0.5641172265904217, + "grad_norm": 0.7338811159133911, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0363, + "step": 19730 + }, + { + "epoch": 0.5644031451036454, + "grad_norm": 0.5576338171958923, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0371, + "step": 19740 + }, + { + "epoch": 0.5646890636168692, + "grad_norm": 0.7390629649162292, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0382, + "step": 19750 + }, + { + "epoch": 0.5649749821300929, + "grad_norm": 0.801812469959259, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0379, + "step": 19760 + }, + { + "epoch": 0.5652609006433167, + "grad_norm": 0.5697385668754578, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0369, + "step": 19770 + }, + { + "epoch": 0.5655468191565404, + "grad_norm": 0.4180932343006134, + "learning_rate": 6.975884226362e-06, + "loss": 0.039, + "step": 19780 + }, + { + "epoch": 0.5658327376697642, + "grad_norm": 0.648389995098114, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0346, + "step": 19790 + }, + { + "epoch": 0.5661186561829878, + "grad_norm": 0.9673929214477539, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0392, + "step": 19800 + }, + { + "epoch": 0.5664045746962115, + "grad_norm": 0.4793975353240967, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0321, + "step": 19810 + }, + { + "epoch": 0.5666904932094353, + "grad_norm": 0.5206098556518555, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0319, + "step": 19820 + }, + { + "epoch": 0.566976411722659, + "grad_norm": 0.39929306507110596, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0335, + "step": 19830 + }, + { + "epoch": 0.5672623302358828, + "grad_norm": 0.6819440722465515, + "learning_rate": 6.923644220932124e-06, + "loss": 0.0338, + "step": 19840 + }, + { + "epoch": 0.5675482487491065, + "grad_norm": 0.7612042427062988, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0345, + "step": 19850 + }, + { + "epoch": 0.5678341672623303, + "grad_norm": 0.472676545381546, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0388, + "step": 19860 + }, + { + "epoch": 0.568120085775554, + "grad_norm": 0.48102107644081116, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0304, + "step": 19870 + }, + { + "epoch": 0.5684060042887777, + "grad_norm": 0.4174644649028778, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0315, + "step": 19880 + }, + { + "epoch": 0.5686919228020014, + "grad_norm": 0.4218151271343231, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0413, + "step": 19890 + }, + { + "epoch": 0.5689778413152251, + "grad_norm": 0.8243978023529053, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0399, + "step": 19900 + }, + { + "epoch": 0.5692637598284489, + "grad_norm": 0.400924414396286, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0298, + "step": 19910 + }, + { + "epoch": 0.5695496783416726, + "grad_norm": 0.5199277400970459, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0351, + "step": 19920 + }, + { + "epoch": 0.5698355968548964, + "grad_norm": 0.5238781571388245, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0374, + "step": 19930 + }, + { + "epoch": 0.5701215153681201, + "grad_norm": 0.7451756596565247, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0378, + "step": 19940 + }, + { + "epoch": 0.5704074338813439, + "grad_norm": 0.5029926300048828, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0391, + "step": 19950 + }, + { + "epoch": 0.5706933523945675, + "grad_norm": 0.5532147884368896, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0383, + "step": 19960 + }, + { + "epoch": 0.5709792709077913, + "grad_norm": 0.5694131851196289, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0364, + "step": 19970 + }, + { + "epoch": 0.571265189421015, + "grad_norm": 0.5066515803337097, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0363, + "step": 19980 + }, + { + "epoch": 0.5715511079342387, + "grad_norm": 0.5676470398902893, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0327, + "step": 19990 + }, + { + "epoch": 0.5718370264474625, + "grad_norm": 0.37414318323135376, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0395, + "step": 20000 + }, + { + "epoch": 0.5721229449606862, + "grad_norm": 0.5888793468475342, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0372, + "step": 20010 + }, + { + "epoch": 0.57240886347391, + "grad_norm": 0.6593262553215027, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0329, + "step": 20020 + }, + { + "epoch": 0.5726947819871336, + "grad_norm": 0.6382879614830017, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0286, + "step": 20030 + }, + { + "epoch": 0.5729807005003574, + "grad_norm": 0.6364927887916565, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0383, + "step": 20040 + }, + { + "epoch": 0.5732666190135811, + "grad_norm": 0.4102194011211395, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0342, + "step": 20050 + }, + { + "epoch": 0.5735525375268049, + "grad_norm": 0.6449235081672668, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0315, + "step": 20060 + }, + { + "epoch": 0.5738384560400286, + "grad_norm": 0.708431601524353, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0316, + "step": 20070 + }, + { + "epoch": 0.5741243745532523, + "grad_norm": 0.46444272994995117, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0352, + "step": 20080 + }, + { + "epoch": 0.5744102930664761, + "grad_norm": 0.7026715278625488, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0337, + "step": 20090 + }, + { + "epoch": 0.5746962115796997, + "grad_norm": 0.43397894501686096, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0303, + "step": 20100 + }, + { + "epoch": 0.5749821300929235, + "grad_norm": 0.4937734305858612, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0403, + "step": 20110 + }, + { + "epoch": 0.5752680486061472, + "grad_norm": 0.5981410145759583, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0375, + "step": 20120 + }, + { + "epoch": 0.575553967119371, + "grad_norm": 0.5616198778152466, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0314, + "step": 20130 + }, + { + "epoch": 0.5758398856325947, + "grad_norm": 0.35028502345085144, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0367, + "step": 20140 + }, + { + "epoch": 0.5761258041458185, + "grad_norm": 0.3556109666824341, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0308, + "step": 20150 + }, + { + "epoch": 0.5764117226590422, + "grad_norm": 0.579409658908844, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0344, + "step": 20160 + }, + { + "epoch": 0.5766976411722659, + "grad_norm": 0.4484683573246002, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0312, + "step": 20170 + }, + { + "epoch": 0.5769835596854896, + "grad_norm": 0.3636038899421692, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0337, + "step": 20180 + }, + { + "epoch": 0.5772694781987133, + "grad_norm": 0.6667287349700928, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0343, + "step": 20190 + }, + { + "epoch": 0.5775553967119371, + "grad_norm": 0.26031574606895447, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0303, + "step": 20200 + }, + { + "epoch": 0.5778413152251608, + "grad_norm": 0.6683355569839478, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0316, + "step": 20210 + }, + { + "epoch": 0.5781272337383846, + "grad_norm": 0.4097786843776703, + "learning_rate": 6.596880604028027e-06, + "loss": 0.0346, + "step": 20220 + }, + { + "epoch": 0.5784131522516083, + "grad_norm": 0.45405757427215576, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0345, + "step": 20230 + }, + { + "epoch": 0.5786990707648321, + "grad_norm": 0.28291839361190796, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0323, + "step": 20240 + }, + { + "epoch": 0.5789849892780558, + "grad_norm": 0.5656186938285828, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0299, + "step": 20250 + }, + { + "epoch": 0.5792709077912794, + "grad_norm": 0.6780310869216919, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0309, + "step": 20260 + }, + { + "epoch": 0.5795568263045032, + "grad_norm": 0.3968813121318817, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0347, + "step": 20270 + }, + { + "epoch": 0.5798427448177269, + "grad_norm": 0.6598440408706665, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0329, + "step": 20280 + }, + { + "epoch": 0.5801286633309507, + "grad_norm": 0.4988970458507538, + "learning_rate": 6.53748481975927e-06, + "loss": 0.038, + "step": 20290 + }, + { + "epoch": 0.5804145818441744, + "grad_norm": 0.8016706705093384, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0358, + "step": 20300 + }, + { + "epoch": 0.5807005003573982, + "grad_norm": 0.8367684483528137, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0354, + "step": 20310 + }, + { + "epoch": 0.5809864188706219, + "grad_norm": 0.5730129480361938, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0421, + "step": 20320 + }, + { + "epoch": 0.5812723373838456, + "grad_norm": 0.43631577491760254, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0361, + "step": 20330 + }, + { + "epoch": 0.5815582558970693, + "grad_norm": 0.7001264691352844, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0355, + "step": 20340 + }, + { + "epoch": 0.581844174410293, + "grad_norm": 0.4988951086997986, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0387, + "step": 20350 + }, + { + "epoch": 0.5821300929235168, + "grad_norm": 0.45731016993522644, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0398, + "step": 20360 + }, + { + "epoch": 0.5824160114367405, + "grad_norm": 0.38684406876564026, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0345, + "step": 20370 + }, + { + "epoch": 0.5827019299499643, + "grad_norm": 0.3924580514431, + "learning_rate": 6.461496350649529e-06, + "loss": 0.037, + "step": 20380 + }, + { + "epoch": 0.582987848463188, + "grad_norm": 0.43735265731811523, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0371, + "step": 20390 + }, + { + "epoch": 0.5832737669764118, + "grad_norm": 0.4595138430595398, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0337, + "step": 20400 + }, + { + "epoch": 0.5835596854896354, + "grad_norm": 0.429569810628891, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0284, + "step": 20410 + }, + { + "epoch": 0.5838456040028592, + "grad_norm": 0.5399166345596313, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0353, + "step": 20420 + }, + { + "epoch": 0.5841315225160829, + "grad_norm": 0.5698734521865845, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0361, + "step": 20430 + }, + { + "epoch": 0.5844174410293066, + "grad_norm": 0.35422587394714355, + "learning_rate": 6.411076603575166e-06, + "loss": 0.033, + "step": 20440 + }, + { + "epoch": 0.5847033595425304, + "grad_norm": 0.4475875198841095, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0344, + "step": 20450 + }, + { + "epoch": 0.5849892780557541, + "grad_norm": 0.4950159192085266, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0428, + "step": 20460 + }, + { + "epoch": 0.5852751965689779, + "grad_norm": 0.695249617099762, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0354, + "step": 20470 + }, + { + "epoch": 0.5855611150822015, + "grad_norm": 0.2538593113422394, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0383, + "step": 20480 + }, + { + "epoch": 0.5858470335954253, + "grad_norm": 0.6770910024642944, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0364, + "step": 20490 + }, + { + "epoch": 0.586132952108649, + "grad_norm": 0.7187057733535767, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0319, + "step": 20500 + }, + { + "epoch": 0.5864188706218728, + "grad_norm": 0.34853193163871765, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.033, + "step": 20510 + }, + { + "epoch": 0.5867047891350965, + "grad_norm": 0.8484768271446228, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0331, + "step": 20520 + }, + { + "epoch": 0.5869907076483202, + "grad_norm": 0.6645244359970093, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0353, + "step": 20530 + }, + { + "epoch": 0.587276626161544, + "grad_norm": 0.5094996690750122, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0374, + "step": 20540 + }, + { + "epoch": 0.5875625446747677, + "grad_norm": 0.5012859106063843, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0329, + "step": 20550 + }, + { + "epoch": 0.5878484631879914, + "grad_norm": 0.6465861797332764, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0282, + "step": 20560 + }, + { + "epoch": 0.5881343817012151, + "grad_norm": 0.5694834589958191, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0313, + "step": 20570 + }, + { + "epoch": 0.5884203002144389, + "grad_norm": 0.4945555627346039, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0353, + "step": 20580 + }, + { + "epoch": 0.5887062187276626, + "grad_norm": 0.5606586933135986, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0343, + "step": 20590 + }, + { + "epoch": 0.5889921372408864, + "grad_norm": 0.6913802027702332, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0358, + "step": 20600 + }, + { + "epoch": 0.5892780557541101, + "grad_norm": 0.8119901418685913, + "learning_rate": 6.269280523549298e-06, + "loss": 0.038, + "step": 20610 + }, + { + "epoch": 0.5895639742673338, + "grad_norm": 0.5558752417564392, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0311, + "step": 20620 + }, + { + "epoch": 0.5898498927805575, + "grad_norm": 0.45028987526893616, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0321, + "step": 20630 + }, + { + "epoch": 0.5901358112937812, + "grad_norm": 0.3697125017642975, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0331, + "step": 20640 + }, + { + "epoch": 0.590421729807005, + "grad_norm": 0.5406038761138916, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0445, + "step": 20650 + }, + { + "epoch": 0.5907076483202287, + "grad_norm": 0.4301048219203949, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0371, + "step": 20660 + }, + { + "epoch": 0.5909935668334525, + "grad_norm": 0.6343403458595276, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0353, + "step": 20670 + }, + { + "epoch": 0.5912794853466762, + "grad_norm": 0.4666310250759125, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0352, + "step": 20680 + }, + { + "epoch": 0.5915654038599, + "grad_norm": 0.7471063733100891, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0352, + "step": 20690 + }, + { + "epoch": 0.5918513223731237, + "grad_norm": 0.9971692562103271, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0327, + "step": 20700 + }, + { + "epoch": 0.5921372408863473, + "grad_norm": 0.5646237134933472, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0365, + "step": 20710 + }, + { + "epoch": 0.5924231593995711, + "grad_norm": 0.46781328320503235, + "learning_rate": 6.17838207381795e-06, + "loss": 0.042, + "step": 20720 + }, + { + "epoch": 0.5927090779127948, + "grad_norm": 0.7061547040939331, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0484, + "step": 20730 + }, + { + "epoch": 0.5929949964260186, + "grad_norm": 0.6651175618171692, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0353, + "step": 20740 + }, + { + "epoch": 0.5932809149392423, + "grad_norm": 0.5959596037864685, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0344, + "step": 20750 + }, + { + "epoch": 0.5935668334524661, + "grad_norm": 0.5869056582450867, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0389, + "step": 20760 + }, + { + "epoch": 0.5938527519656898, + "grad_norm": 0.42101356387138367, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0288, + "step": 20770 + }, + { + "epoch": 0.5941386704789136, + "grad_norm": 0.6310023069381714, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0362, + "step": 20780 + }, + { + "epoch": 0.5944245889921372, + "grad_norm": 0.6737013459205627, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0377, + "step": 20790 + }, + { + "epoch": 0.5947105075053609, + "grad_norm": 0.6716046333312988, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0415, + "step": 20800 + }, + { + "epoch": 0.5949964260185847, + "grad_norm": 0.9742669463157654, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0337, + "step": 20810 + }, + { + "epoch": 0.5952823445318084, + "grad_norm": 0.571782648563385, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0362, + "step": 20820 + }, + { + "epoch": 0.5955682630450322, + "grad_norm": 0.9673911333084106, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0362, + "step": 20830 + }, + { + "epoch": 0.5958541815582559, + "grad_norm": 0.5391695499420166, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0331, + "step": 20840 + }, + { + "epoch": 0.5961401000714797, + "grad_norm": 1.4766349792480469, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0332, + "step": 20850 + }, + { + "epoch": 0.5964260185847033, + "grad_norm": 0.6329004168510437, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0375, + "step": 20860 + }, + { + "epoch": 0.5967119370979271, + "grad_norm": 0.6745501160621643, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0347, + "step": 20870 + }, + { + "epoch": 0.5969978556111508, + "grad_norm": 0.3006536364555359, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0321, + "step": 20880 + }, + { + "epoch": 0.5972837741243745, + "grad_norm": 0.4666125476360321, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0363, + "step": 20890 + }, + { + "epoch": 0.5975696926375983, + "grad_norm": 0.3881456255912781, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0318, + "step": 20900 + }, + { + "epoch": 0.597855611150822, + "grad_norm": 0.4211449921131134, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0357, + "step": 20910 + }, + { + "epoch": 0.5981415296640458, + "grad_norm": 1.125683307647705, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0364, + "step": 20920 + }, + { + "epoch": 0.5984274481772694, + "grad_norm": 0.9670853614807129, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0385, + "step": 20930 + }, + { + "epoch": 0.5987133666904932, + "grad_norm": 0.7302138209342957, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0321, + "step": 20940 + }, + { + "epoch": 0.5989992852037169, + "grad_norm": 0.7883613109588623, + "learning_rate": 5.990549152010853e-06, + "loss": 0.038, + "step": 20950 + }, + { + "epoch": 0.5992852037169407, + "grad_norm": 0.44051188230514526, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0423, + "step": 20960 + }, + { + "epoch": 0.5995711222301644, + "grad_norm": 0.5225116014480591, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0293, + "step": 20970 + }, + { + "epoch": 0.5998570407433881, + "grad_norm": 0.44672495126724243, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0314, + "step": 20980 + }, + { + "epoch": 0.6001429592566119, + "grad_norm": 0.4489240050315857, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0298, + "step": 20990 + }, + { + "epoch": 0.6004288777698356, + "grad_norm": 0.3942757844924927, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0323, + "step": 21000 + }, + { + "epoch": 0.6007147962830593, + "grad_norm": 0.5079668760299683, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0435, + "step": 21010 + }, + { + "epoch": 0.601000714796283, + "grad_norm": 0.5057359933853149, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0364, + "step": 21020 + }, + { + "epoch": 0.6012866333095068, + "grad_norm": 0.4823545515537262, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0408, + "step": 21030 + }, + { + "epoch": 0.6015725518227305, + "grad_norm": 0.42647498846054077, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0366, + "step": 21040 + }, + { + "epoch": 0.6018584703359543, + "grad_norm": 0.5967830419540405, + "learning_rate": 5.909845843697164e-06, + "loss": 0.037, + "step": 21050 + }, + { + "epoch": 0.602144388849178, + "grad_norm": 0.4567292034626007, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0306, + "step": 21060 + }, + { + "epoch": 0.6024303073624017, + "grad_norm": 0.6767273545265198, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0447, + "step": 21070 + }, + { + "epoch": 0.6027162258756255, + "grad_norm": 0.2957002520561218, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0339, + "step": 21080 + }, + { + "epoch": 0.6030021443888491, + "grad_norm": 0.6870969533920288, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0313, + "step": 21090 + }, + { + "epoch": 0.6032880629020729, + "grad_norm": 0.530910313129425, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0377, + "step": 21100 + }, + { + "epoch": 0.6035739814152966, + "grad_norm": 0.21370625495910645, + "learning_rate": 5.86170998451151e-06, + "loss": 0.032, + "step": 21110 + }, + { + "epoch": 0.6038598999285204, + "grad_norm": 0.6039503812789917, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0258, + "step": 21120 + }, + { + "epoch": 0.6041458184417441, + "grad_norm": 0.5375682711601257, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0341, + "step": 21130 + }, + { + "epoch": 0.6044317369549679, + "grad_norm": 0.4819096326828003, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0309, + "step": 21140 + }, + { + "epoch": 0.6047176554681916, + "grad_norm": 0.31165415048599243, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0278, + "step": 21150 + }, + { + "epoch": 0.6050035739814152, + "grad_norm": 0.2781001925468445, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0342, + "step": 21160 + }, + { + "epoch": 0.605289492494639, + "grad_norm": 0.44726037979125977, + "learning_rate": 5.813791207086085e-06, + "loss": 0.032, + "step": 21170 + }, + { + "epoch": 0.6055754110078627, + "grad_norm": 0.5762766599655151, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0325, + "step": 21180 + }, + { + "epoch": 0.6058613295210865, + "grad_norm": 0.49829939007759094, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0322, + "step": 21190 + }, + { + "epoch": 0.6061472480343102, + "grad_norm": 0.4683297276496887, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0369, + "step": 21200 + }, + { + "epoch": 0.606433166547534, + "grad_norm": 0.662159264087677, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0278, + "step": 21210 + }, + { + "epoch": 0.6067190850607577, + "grad_norm": 0.4397001564502716, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0366, + "step": 21220 + }, + { + "epoch": 0.6070050035739815, + "grad_norm": 0.4977007508277893, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0293, + "step": 21230 + }, + { + "epoch": 0.6072909220872051, + "grad_norm": 0.3705490827560425, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0315, + "step": 21240 + }, + { + "epoch": 0.6075768406004288, + "grad_norm": 0.6350240111351013, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0286, + "step": 21250 + }, + { + "epoch": 0.6078627591136526, + "grad_norm": 0.5590423941612244, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0373, + "step": 21260 + }, + { + "epoch": 0.6081486776268763, + "grad_norm": 0.5244049429893494, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0325, + "step": 21270 + }, + { + "epoch": 0.6084345961401001, + "grad_norm": 1.082044005393982, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0373, + "step": 21280 + }, + { + "epoch": 0.6087205146533238, + "grad_norm": 0.614028811454773, + "learning_rate": 5.71861298612245e-06, + "loss": 0.031, + "step": 21290 + }, + { + "epoch": 0.6090064331665476, + "grad_norm": 0.783205509185791, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0289, + "step": 21300 + }, + { + "epoch": 0.6092923516797712, + "grad_norm": 0.5420807600021362, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.031, + "step": 21310 + }, + { + "epoch": 0.609578270192995, + "grad_norm": 0.42979222536087036, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0291, + "step": 21320 + }, + { + "epoch": 0.6098641887062187, + "grad_norm": 0.44511356949806213, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.031, + "step": 21330 + }, + { + "epoch": 0.6101501072194424, + "grad_norm": 0.528799831867218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0269, + "step": 21340 + }, + { + "epoch": 0.6104360257326662, + "grad_norm": 0.43274471163749695, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0438, + "step": 21350 + }, + { + "epoch": 0.6107219442458899, + "grad_norm": 0.8020172715187073, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0393, + "step": 21360 + }, + { + "epoch": 0.6110078627591137, + "grad_norm": 0.4354296028614044, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0338, + "step": 21370 + }, + { + "epoch": 0.6112937812723374, + "grad_norm": 0.587364673614502, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0359, + "step": 21380 + }, + { + "epoch": 0.6115796997855611, + "grad_norm": 0.5426310300827026, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0333, + "step": 21390 + }, + { + "epoch": 0.6118656182987848, + "grad_norm": 0.5900459289550781, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0344, + "step": 21400 + }, + { + "epoch": 0.6121515368120086, + "grad_norm": 0.5652357935905457, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0396, + "step": 21410 + }, + { + "epoch": 0.6124374553252323, + "grad_norm": 0.5287114977836609, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0387, + "step": 21420 + }, + { + "epoch": 0.612723373838456, + "grad_norm": 0.7939184904098511, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0351, + "step": 21430 + }, + { + "epoch": 0.6130092923516798, + "grad_norm": 0.6840642094612122, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0349, + "step": 21440 + }, + { + "epoch": 0.6132952108649035, + "grad_norm": 0.3717428147792816, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0336, + "step": 21450 + }, + { + "epoch": 0.6135811293781273, + "grad_norm": 0.5073713064193726, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0326, + "step": 21460 + }, + { + "epoch": 0.6138670478913509, + "grad_norm": 1.1579232215881348, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0388, + "step": 21470 + }, + { + "epoch": 0.6141529664045747, + "grad_norm": 0.4209369122982025, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0307, + "step": 21480 + }, + { + "epoch": 0.6144388849177984, + "grad_norm": 0.38663822412490845, + "learning_rate": 5.561973825289734e-06, + "loss": 0.037, + "step": 21490 + }, + { + "epoch": 0.6147248034310222, + "grad_norm": 0.538270890712738, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0333, + "step": 21500 + }, + { + "epoch": 0.6150107219442459, + "grad_norm": 0.28280535340309143, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0319, + "step": 21510 + }, + { + "epoch": 0.6152966404574696, + "grad_norm": 0.5407803058624268, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0352, + "step": 21520 + }, + { + "epoch": 0.6155825589706934, + "grad_norm": 1.4600974321365356, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0409, + "step": 21530 + }, + { + "epoch": 0.615868477483917, + "grad_norm": 0.659900426864624, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0322, + "step": 21540 + }, + { + "epoch": 0.6161543959971408, + "grad_norm": 0.6401934623718262, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0339, + "step": 21550 + }, + { + "epoch": 0.6164403145103645, + "grad_norm": 0.6409866213798523, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0331, + "step": 21560 + }, + { + "epoch": 0.6167262330235883, + "grad_norm": 0.6627630591392517, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0332, + "step": 21570 + }, + { + "epoch": 0.617012151536812, + "grad_norm": 0.6180721521377563, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0327, + "step": 21580 + }, + { + "epoch": 0.6172980700500358, + "grad_norm": 0.4689866006374359, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0276, + "step": 21590 + }, + { + "epoch": 0.6175839885632595, + "grad_norm": 0.5039265751838684, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0427, + "step": 21600 + }, + { + "epoch": 0.6178699070764831, + "grad_norm": 0.5313833355903625, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0351, + "step": 21610 + }, + { + "epoch": 0.6181558255897069, + "grad_norm": 0.4919044077396393, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0327, + "step": 21620 + }, + { + "epoch": 0.6184417441029306, + "grad_norm": 0.5446444153785706, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0331, + "step": 21630 + }, + { + "epoch": 0.6187276626161544, + "grad_norm": 0.5198109745979309, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.032, + "step": 21640 + }, + { + "epoch": 0.6190135811293781, + "grad_norm": 0.5684625506401062, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0339, + "step": 21650 + }, + { + "epoch": 0.6192994996426019, + "grad_norm": 0.6882810592651367, + "learning_rate": 5.430834687545416e-06, + "loss": 0.035, + "step": 21660 + }, + { + "epoch": 0.6195854181558256, + "grad_norm": 0.7360101938247681, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0306, + "step": 21670 + }, + { + "epoch": 0.6198713366690494, + "grad_norm": 0.5557180047035217, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0241, + "step": 21680 + }, + { + "epoch": 0.620157255182273, + "grad_norm": 0.4302096962928772, + "learning_rate": 5.407887295494495e-06, + "loss": 0.035, + "step": 21690 + }, + { + "epoch": 0.6204431736954967, + "grad_norm": 0.4740016460418701, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0331, + "step": 21700 + }, + { + "epoch": 0.6207290922087205, + "grad_norm": 0.5400598049163818, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0297, + "step": 21710 + }, + { + "epoch": 0.6210150107219442, + "grad_norm": 0.4270641803741455, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0334, + "step": 21720 + }, + { + "epoch": 0.621300929235168, + "grad_norm": 0.41063550114631653, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0375, + "step": 21730 + }, + { + "epoch": 0.6215868477483917, + "grad_norm": 0.48556044697761536, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0291, + "step": 21740 + }, + { + "epoch": 0.6218727662616155, + "grad_norm": 0.2872731387615204, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0323, + "step": 21750 + }, + { + "epoch": 0.6221586847748392, + "grad_norm": 0.4088454246520996, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0307, + "step": 21760 + }, + { + "epoch": 0.622444603288063, + "grad_norm": 0.42600440979003906, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0326, + "step": 21770 + }, + { + "epoch": 0.6227305218012866, + "grad_norm": 0.36466315388679504, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0337, + "step": 21780 + }, + { + "epoch": 0.6230164403145103, + "grad_norm": 0.588921308517456, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0336, + "step": 21790 + }, + { + "epoch": 0.6233023588277341, + "grad_norm": 0.44768571853637695, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0326, + "step": 21800 + }, + { + "epoch": 0.6235882773409578, + "grad_norm": 1.1612637042999268, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0355, + "step": 21810 + }, + { + "epoch": 0.6238741958541816, + "grad_norm": 1.0912114381790161, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0436, + "step": 21820 + }, + { + "epoch": 0.6241601143674053, + "grad_norm": 0.5813164710998535, + "learning_rate": 5.301584321328435e-06, + "loss": 0.034, + "step": 21830 + }, + { + "epoch": 0.624446032880629, + "grad_norm": 0.45064911246299744, + "learning_rate": 5.294041118587667e-06, + "loss": 0.032, + "step": 21840 + }, + { + "epoch": 0.6247319513938527, + "grad_norm": 0.5173943638801575, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0322, + "step": 21850 + }, + { + "epoch": 0.6250178699070765, + "grad_norm": 0.41157352924346924, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0319, + "step": 21860 + }, + { + "epoch": 0.6253037884203002, + "grad_norm": 0.5711286067962646, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0288, + "step": 21870 + }, + { + "epoch": 0.6255897069335239, + "grad_norm": 0.5108116865158081, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0325, + "step": 21880 + }, + { + "epoch": 0.6258756254467477, + "grad_norm": 0.49562424421310425, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0306, + "step": 21890 + }, + { + "epoch": 0.6261615439599714, + "grad_norm": 0.3392108976840973, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0358, + "step": 21900 + }, + { + "epoch": 0.6264474624731952, + "grad_norm": 1.0588114261627197, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0404, + "step": 21910 + }, + { + "epoch": 0.6267333809864188, + "grad_norm": 0.6979959607124329, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0349, + "step": 21920 + }, + { + "epoch": 0.6270192994996426, + "grad_norm": 0.3185918927192688, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0276, + "step": 21930 + }, + { + "epoch": 0.6273052180128663, + "grad_norm": 0.3921501338481903, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0326, + "step": 21940 + }, + { + "epoch": 0.6275911365260901, + "grad_norm": 0.9666212797164917, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0346, + "step": 21950 + }, + { + "epoch": 0.6278770550393138, + "grad_norm": 0.4483211040496826, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0306, + "step": 21960 + }, + { + "epoch": 0.6281629735525375, + "grad_norm": 0.4839077293872833, + "learning_rate": 5.196592054173714e-06, + "loss": 0.026, + "step": 21970 + }, + { + "epoch": 0.6284488920657613, + "grad_norm": 0.5054528117179871, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0299, + "step": 21980 + }, + { + "epoch": 0.628734810578985, + "grad_norm": 0.5953076481819153, + "learning_rate": 5.181701567303612e-06, + "loss": 0.036, + "step": 21990 + }, + { + "epoch": 0.6290207290922087, + "grad_norm": 0.39300060272216797, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0358, + "step": 22000 + }, + { + "epoch": 0.6293066476054324, + "grad_norm": 0.42864665389060974, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0363, + "step": 22010 + }, + { + "epoch": 0.6295925661186562, + "grad_norm": 0.33609238266944885, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0398, + "step": 22020 + }, + { + "epoch": 0.6298784846318799, + "grad_norm": 0.4237107038497925, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0319, + "step": 22030 + }, + { + "epoch": 0.6301644031451037, + "grad_norm": 0.42774054408073425, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0332, + "step": 22040 + }, + { + "epoch": 0.6304503216583274, + "grad_norm": 0.8992825150489807, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0396, + "step": 22050 + }, + { + "epoch": 0.630736240171551, + "grad_norm": 0.20832861959934235, + "learning_rate": 5.129800405815733e-06, + "loss": 0.03, + "step": 22060 + }, + { + "epoch": 0.6310221586847748, + "grad_norm": 0.5961321592330933, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0429, + "step": 22070 + }, + { + "epoch": 0.6313080771979985, + "grad_norm": 0.5037736296653748, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0332, + "step": 22080 + }, + { + "epoch": 0.6315939957112223, + "grad_norm": 0.383732408285141, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0293, + "step": 22090 + }, + { + "epoch": 0.631879914224446, + "grad_norm": 0.8124368786811829, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0286, + "step": 22100 + }, + { + "epoch": 0.6321658327376698, + "grad_norm": 0.96833735704422, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0346, + "step": 22110 + }, + { + "epoch": 0.6324517512508935, + "grad_norm": 0.42382001876831055, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0345, + "step": 22120 + }, + { + "epoch": 0.6327376697641173, + "grad_norm": 0.5928776860237122, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0314, + "step": 22130 + }, + { + "epoch": 0.633023588277341, + "grad_norm": 0.7822670340538025, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0335, + "step": 22140 + }, + { + "epoch": 0.6333095067905646, + "grad_norm": 0.6383520364761353, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0321, + "step": 22150 + }, + { + "epoch": 0.6335954253037884, + "grad_norm": 0.3413240611553192, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0314, + "step": 22160 + }, + { + "epoch": 0.6338813438170121, + "grad_norm": 0.5960783958435059, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0385, + "step": 22170 + }, + { + "epoch": 0.6341672623302359, + "grad_norm": 0.2557702660560608, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0316, + "step": 22180 + }, + { + "epoch": 0.6344531808434596, + "grad_norm": 0.6229982376098633, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0325, + "step": 22190 + }, + { + "epoch": 0.6347390993566834, + "grad_norm": 0.5080077052116394, + "learning_rate": 5.027013727107874e-06, + "loss": 0.036, + "step": 22200 + }, + { + "epoch": 0.6350250178699071, + "grad_norm": 0.5630851984024048, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0279, + "step": 22210 + }, + { + "epoch": 0.6353109363831309, + "grad_norm": 0.81584233045578, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0343, + "step": 22220 + }, + { + "epoch": 0.6355968548963545, + "grad_norm": 0.3985321521759033, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0289, + "step": 22230 + }, + { + "epoch": 0.6358827734095782, + "grad_norm": 0.4481184482574463, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0345, + "step": 22240 + }, + { + "epoch": 0.636168691922802, + "grad_norm": 0.3640075623989105, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0307, + "step": 22250 + }, + { + "epoch": 0.6364546104360257, + "grad_norm": 0.4006771147251129, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0381, + "step": 22260 + }, + { + "epoch": 0.6367405289492495, + "grad_norm": 0.7638134360313416, + "learning_rate": 4.976134120528886e-06, + "loss": 0.039, + "step": 22270 + }, + { + "epoch": 0.6370264474624732, + "grad_norm": 0.4820837080478668, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0281, + "step": 22280 + }, + { + "epoch": 0.637312365975697, + "grad_norm": 0.5928444266319275, + "learning_rate": 4.961660586405147e-06, + "loss": 0.033, + "step": 22290 + }, + { + "epoch": 0.6375982844889206, + "grad_norm": 0.50687575340271, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0357, + "step": 22300 + }, + { + "epoch": 0.6378842030021444, + "grad_norm": 0.673939049243927, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0301, + "step": 22310 + }, + { + "epoch": 0.6381701215153681, + "grad_norm": 0.4300031065940857, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.029, + "step": 22320 + }, + { + "epoch": 0.6384560400285918, + "grad_norm": 0.6585102677345276, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0333, + "step": 22330 + }, + { + "epoch": 0.6387419585418156, + "grad_norm": 0.6430448889732361, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0345, + "step": 22340 + }, + { + "epoch": 0.6390278770550393, + "grad_norm": 0.8272712826728821, + "learning_rate": 4.918410326949594e-06, + "loss": 0.034, + "step": 22350 + }, + { + "epoch": 0.6393137955682631, + "grad_norm": 0.7631726861000061, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0321, + "step": 22360 + }, + { + "epoch": 0.6395997140814867, + "grad_norm": 0.5562252402305603, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0339, + "step": 22370 + }, + { + "epoch": 0.6398856325947105, + "grad_norm": 0.6027814149856567, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0355, + "step": 22380 + }, + { + "epoch": 0.6401715511079342, + "grad_norm": 0.3548984229564667, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0343, + "step": 22390 + }, + { + "epoch": 0.640457469621158, + "grad_norm": 0.4959709346294403, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.031, + "step": 22400 + }, + { + "epoch": 0.6407433881343817, + "grad_norm": 0.3765028715133667, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.0406, + "step": 22410 + }, + { + "epoch": 0.6410293066476054, + "grad_norm": 0.5014662146568298, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0297, + "step": 22420 + }, + { + "epoch": 0.6413152251608292, + "grad_norm": 0.5085675716400146, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0324, + "step": 22430 + }, + { + "epoch": 0.6416011436740529, + "grad_norm": 0.37595826387405396, + "learning_rate": 4.854017257346105e-06, + "loss": 0.033, + "step": 22440 + }, + { + "epoch": 0.6418870621872766, + "grad_norm": 0.5408678650856018, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0323, + "step": 22450 + }, + { + "epoch": 0.6421729807005003, + "grad_norm": 0.4319652020931244, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0269, + "step": 22460 + }, + { + "epoch": 0.6424588992137241, + "grad_norm": 0.41388124227523804, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0311, + "step": 22470 + }, + { + "epoch": 0.6427448177269478, + "grad_norm": 0.4778555631637573, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0332, + "step": 22480 + }, + { + "epoch": 0.6430307362401716, + "grad_norm": 0.38835474848747253, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0304, + "step": 22490 + }, + { + "epoch": 0.6433166547533953, + "grad_norm": 0.5165611505508423, + "learning_rate": 4.81141273556404e-06, + "loss": 0.0344, + "step": 22500 + }, + { + "epoch": 0.643602573266619, + "grad_norm": 0.4285198450088501, + "learning_rate": 4.804337352679613e-06, + "loss": 0.035, + "step": 22510 + }, + { + "epoch": 0.6438884917798428, + "grad_norm": 0.4512922167778015, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0354, + "step": 22520 + }, + { + "epoch": 0.6441744102930664, + "grad_norm": 0.33437663316726685, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0343, + "step": 22530 + }, + { + "epoch": 0.6444603288062902, + "grad_norm": 0.45291104912757874, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0307, + "step": 22540 + }, + { + "epoch": 0.6447462473195139, + "grad_norm": 0.5920093655586243, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0338, + "step": 22550 + }, + { + "epoch": 0.6450321658327377, + "grad_norm": 0.6362392902374268, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0335, + "step": 22560 + }, + { + "epoch": 0.6453180843459614, + "grad_norm": 0.28033652901649475, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0267, + "step": 22570 + }, + { + "epoch": 0.6456040028591852, + "grad_norm": 0.4563148617744446, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0316, + "step": 22580 + }, + { + "epoch": 0.6458899213724089, + "grad_norm": 0.4889507591724396, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.034, + "step": 22590 + }, + { + "epoch": 0.6461758398856325, + "grad_norm": 0.6826061010360718, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0316, + "step": 22600 + }, + { + "epoch": 0.6464617583988563, + "grad_norm": 0.45066431164741516, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0287, + "step": 22610 + }, + { + "epoch": 0.64674767691208, + "grad_norm": 0.41994187235832214, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0312, + "step": 22620 + }, + { + "epoch": 0.6470335954253038, + "grad_norm": 0.39731675386428833, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0347, + "step": 22630 + }, + { + "epoch": 0.6473195139385275, + "grad_norm": 0.5207498073577881, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0304, + "step": 22640 + }, + { + "epoch": 0.6476054324517513, + "grad_norm": 0.42930668592453003, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0344, + "step": 22650 + }, + { + "epoch": 0.647891350964975, + "grad_norm": 0.3023674488067627, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0359, + "step": 22660 + }, + { + "epoch": 0.6481772694781988, + "grad_norm": 0.43205010890960693, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0323, + "step": 22670 + }, + { + "epoch": 0.6484631879914224, + "grad_norm": 0.5984707474708557, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0318, + "step": 22680 + }, + { + "epoch": 0.6487491065046461, + "grad_norm": 0.43477800488471985, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0346, + "step": 22690 + }, + { + "epoch": 0.6490350250178699, + "grad_norm": 0.3570900857448578, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0342, + "step": 22700 + }, + { + "epoch": 0.6493209435310936, + "grad_norm": 0.47367945313453674, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0367, + "step": 22710 + }, + { + "epoch": 0.6496068620443174, + "grad_norm": 0.3768099844455719, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0357, + "step": 22720 + }, + { + "epoch": 0.6498927805575411, + "grad_norm": 0.6188724040985107, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0299, + "step": 22730 + }, + { + "epoch": 0.6501786990707649, + "grad_norm": 0.5733038783073425, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0343, + "step": 22740 + }, + { + "epoch": 0.6504646175839885, + "grad_norm": 0.5000156164169312, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0284, + "step": 22750 + }, + { + "epoch": 0.6507505360972123, + "grad_norm": 0.22813546657562256, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0288, + "step": 22760 + }, + { + "epoch": 0.651036454610436, + "grad_norm": 0.4805088937282562, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0305, + "step": 22770 + }, + { + "epoch": 0.6513223731236597, + "grad_norm": 0.4652612507343292, + "learning_rate": 4.616077433849538e-06, + "loss": 0.0304, + "step": 22780 + }, + { + "epoch": 0.6516082916368835, + "grad_norm": 0.5010579824447632, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0337, + "step": 22790 + }, + { + "epoch": 0.6518942101501072, + "grad_norm": 0.36260518431663513, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0284, + "step": 22800 + }, + { + "epoch": 0.652180128663331, + "grad_norm": 0.45098820328712463, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0297, + "step": 22810 + }, + { + "epoch": 0.6524660471765547, + "grad_norm": 0.6154504418373108, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0366, + "step": 22820 + }, + { + "epoch": 0.6527519656897784, + "grad_norm": 0.4522152543067932, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.033, + "step": 22830 + }, + { + "epoch": 0.6530378842030021, + "grad_norm": 0.34195253252983093, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0304, + "step": 22840 + }, + { + "epoch": 0.6533238027162259, + "grad_norm": 0.49787941575050354, + "learning_rate": 4.568154392147005e-06, + "loss": 0.033, + "step": 22850 + }, + { + "epoch": 0.6536097212294496, + "grad_norm": 0.5249335765838623, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0381, + "step": 22860 + }, + { + "epoch": 0.6538956397426733, + "grad_norm": 0.7645581960678101, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0298, + "step": 22870 + }, + { + "epoch": 0.6541815582558971, + "grad_norm": 0.6034232974052429, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0313, + "step": 22880 + }, + { + "epoch": 0.6544674767691208, + "grad_norm": 0.3499184846878052, + "learning_rate": 4.54093567906903e-06, + "loss": 0.036, + "step": 22890 + }, + { + "epoch": 0.6547533952823446, + "grad_norm": 0.4157135486602783, + "learning_rate": 4.534149931036931e-06, + "loss": 0.033, + "step": 22900 + }, + { + "epoch": 0.6550393137955682, + "grad_norm": 0.4563712775707245, + "learning_rate": 4.527371771040039e-06, + "loss": 0.0361, + "step": 22910 + }, + { + "epoch": 0.655325232308792, + "grad_norm": 1.080802321434021, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0307, + "step": 22920 + }, + { + "epoch": 0.6556111508220157, + "grad_norm": 0.38259357213974, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0292, + "step": 22930 + }, + { + "epoch": 0.6558970693352395, + "grad_norm": 0.6920587420463562, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0322, + "step": 22940 + }, + { + "epoch": 0.6561829878484632, + "grad_norm": 0.628978967666626, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0391, + "step": 22950 + }, + { + "epoch": 0.6564689063616869, + "grad_norm": 0.4848436713218689, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0306, + "step": 22960 + }, + { + "epoch": 0.6567548248749107, + "grad_norm": 0.4478876292705536, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0334, + "step": 22970 + }, + { + "epoch": 0.6570407433881343, + "grad_norm": 0.47360673546791077, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0357, + "step": 22980 + }, + { + "epoch": 0.6573266619013581, + "grad_norm": 0.32840496301651, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0339, + "step": 22990 + }, + { + "epoch": 0.6576125804145818, + "grad_norm": 0.4047236442565918, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0321, + "step": 23000 + }, + { + "epoch": 0.6578984989278056, + "grad_norm": 0.7817053198814392, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0375, + "step": 23010 + }, + { + "epoch": 0.6581844174410293, + "grad_norm": 0.38985809683799744, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0343, + "step": 23020 + }, + { + "epoch": 0.6584703359542531, + "grad_norm": 0.45360830426216125, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0287, + "step": 23030 + }, + { + "epoch": 0.6587562544674768, + "grad_norm": 0.2886345088481903, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0322, + "step": 23040 + }, + { + "epoch": 0.6590421729807004, + "grad_norm": 0.8546258211135864, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0331, + "step": 23050 + }, + { + "epoch": 0.6593280914939242, + "grad_norm": 0.48426172137260437, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0343, + "step": 23060 + }, + { + "epoch": 0.6596140100071479, + "grad_norm": 0.46379074454307556, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0296, + "step": 23070 + }, + { + "epoch": 0.6598999285203717, + "grad_norm": 0.7772185206413269, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0319, + "step": 23080 + }, + { + "epoch": 0.6601858470335954, + "grad_norm": 0.4606277644634247, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0336, + "step": 23090 + }, + { + "epoch": 0.6604717655468192, + "grad_norm": 0.43342530727386475, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0287, + "step": 23100 + }, + { + "epoch": 0.6607576840600429, + "grad_norm": 0.385151207447052, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0363, + "step": 23110 + }, + { + "epoch": 0.6610436025732667, + "grad_norm": 0.3960207998752594, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0306, + "step": 23120 + }, + { + "epoch": 0.6613295210864903, + "grad_norm": 0.41210439801216125, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0348, + "step": 23130 + }, + { + "epoch": 0.661615439599714, + "grad_norm": 0.41976168751716614, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0272, + "step": 23140 + }, + { + "epoch": 0.6619013581129378, + "grad_norm": 0.3195948004722595, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0362, + "step": 23150 + }, + { + "epoch": 0.6621872766261615, + "grad_norm": 0.7024016380310059, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0316, + "step": 23160 + }, + { + "epoch": 0.6624731951393853, + "grad_norm": 0.2894183099269867, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0339, + "step": 23170 + }, + { + "epoch": 0.662759113652609, + "grad_norm": 0.489715576171875, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0272, + "step": 23180 + }, + { + "epoch": 0.6630450321658328, + "grad_norm": 0.3406641185283661, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0272, + "step": 23190 + }, + { + "epoch": 0.6633309506790565, + "grad_norm": 0.3647848963737488, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0337, + "step": 23200 + }, + { + "epoch": 0.6636168691922802, + "grad_norm": 0.7023333311080933, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0334, + "step": 23210 + }, + { + "epoch": 0.6639027877055039, + "grad_norm": 0.43989211320877075, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0313, + "step": 23220 + }, + { + "epoch": 0.6641887062187276, + "grad_norm": 0.7329099774360657, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0283, + "step": 23230 + }, + { + "epoch": 0.6644746247319514, + "grad_norm": 0.3954019546508789, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0321, + "step": 23240 + }, + { + "epoch": 0.6647605432451751, + "grad_norm": 0.38020703196525574, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0337, + "step": 23250 + }, + { + "epoch": 0.6650464617583989, + "grad_norm": 0.5988985300064087, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0353, + "step": 23260 + }, + { + "epoch": 0.6653323802716226, + "grad_norm": 0.4259869158267975, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0316, + "step": 23270 + }, + { + "epoch": 0.6656182987848464, + "grad_norm": 0.4322545528411865, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0318, + "step": 23280 + }, + { + "epoch": 0.66590421729807, + "grad_norm": 0.40275540947914124, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0344, + "step": 23290 + }, + { + "epoch": 0.6661901358112938, + "grad_norm": 0.5070827603340149, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0336, + "step": 23300 + }, + { + "epoch": 0.6664760543245175, + "grad_norm": 0.614973247051239, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0352, + "step": 23310 + }, + { + "epoch": 0.6667619728377412, + "grad_norm": 0.4637722074985504, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0277, + "step": 23320 + }, + { + "epoch": 0.667047891350965, + "grad_norm": 0.34951677918434143, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0284, + "step": 23330 + }, + { + "epoch": 0.6673338098641887, + "grad_norm": 0.5609407424926758, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0304, + "step": 23340 + }, + { + "epoch": 0.6676197283774125, + "grad_norm": 0.44585973024368286, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0263, + "step": 23350 + }, + { + "epoch": 0.6679056468906361, + "grad_norm": 0.5311269760131836, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0311, + "step": 23360 + }, + { + "epoch": 0.6681915654038599, + "grad_norm": 0.4923100471496582, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0277, + "step": 23370 + }, + { + "epoch": 0.6684774839170836, + "grad_norm": 0.5254819989204407, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0328, + "step": 23380 + }, + { + "epoch": 0.6687634024303074, + "grad_norm": 0.47537869215011597, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0373, + "step": 23390 + }, + { + "epoch": 0.6690493209435311, + "grad_norm": 0.40087464451789856, + "learning_rate": 4.204700678381975e-06, + "loss": 0.034, + "step": 23400 + }, + { + "epoch": 0.6693352394567548, + "grad_norm": 0.5166190266609192, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0314, + "step": 23410 + }, + { + "epoch": 0.6696211579699786, + "grad_norm": 0.42874693870544434, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0279, + "step": 23420 + }, + { + "epoch": 0.6699070764832022, + "grad_norm": 0.3685651123523712, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0313, + "step": 23430 + }, + { + "epoch": 0.670192994996426, + "grad_norm": 0.5417486429214478, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.033, + "step": 23440 + }, + { + "epoch": 0.6704789135096497, + "grad_norm": 0.5764726996421814, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0368, + "step": 23450 + }, + { + "epoch": 0.6707648320228735, + "grad_norm": 0.44168850779533386, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0258, + "step": 23460 + }, + { + "epoch": 0.6710507505360972, + "grad_norm": 0.39990919828414917, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0403, + "step": 23470 + }, + { + "epoch": 0.671336669049321, + "grad_norm": 0.7526253461837769, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0334, + "step": 23480 + }, + { + "epoch": 0.6716225875625447, + "grad_norm": 0.4888451397418976, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0314, + "step": 23490 + }, + { + "epoch": 0.6719085060757684, + "grad_norm": 0.5732892751693726, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0277, + "step": 23500 + }, + { + "epoch": 0.6721944245889921, + "grad_norm": 0.5806633830070496, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0347, + "step": 23510 + }, + { + "epoch": 0.6724803431022158, + "grad_norm": 0.4336501657962799, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0369, + "step": 23520 + }, + { + "epoch": 0.6727662616154396, + "grad_norm": 0.47082582116127014, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0408, + "step": 23530 + }, + { + "epoch": 0.6730521801286633, + "grad_norm": 0.6571422815322876, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0316, + "step": 23540 + }, + { + "epoch": 0.6733380986418871, + "grad_norm": 0.4899539649486542, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0256, + "step": 23550 + }, + { + "epoch": 0.6736240171551108, + "grad_norm": 0.3201868236064911, + "learning_rate": 4.103441847743051e-06, + "loss": 0.029, + "step": 23560 + }, + { + "epoch": 0.6739099356683346, + "grad_norm": 0.4385588765144348, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0284, + "step": 23570 + }, + { + "epoch": 0.6741958541815583, + "grad_norm": 0.5079174637794495, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0298, + "step": 23580 + }, + { + "epoch": 0.6744817726947819, + "grad_norm": 0.609523355960846, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0313, + "step": 23590 + }, + { + "epoch": 0.6747676912080057, + "grad_norm": 0.487690269947052, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0246, + "step": 23600 + }, + { + "epoch": 0.6750536097212294, + "grad_norm": 0.5146880745887756, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0319, + "step": 23610 + }, + { + "epoch": 0.6753395282344532, + "grad_norm": 0.5848239064216614, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0282, + "step": 23620 + }, + { + "epoch": 0.6756254467476769, + "grad_norm": 0.7779616117477417, + "learning_rate": 4.05979084812184e-06, + "loss": 0.033, + "step": 23630 + }, + { + "epoch": 0.6759113652609007, + "grad_norm": 0.3329331576824188, + "learning_rate": 4.053587511509546e-06, + "loss": 0.028, + "step": 23640 + }, + { + "epoch": 0.6761972837741244, + "grad_norm": 0.4691336154937744, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0313, + "step": 23650 + }, + { + "epoch": 0.6764832022873482, + "grad_norm": 0.47258421778678894, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0326, + "step": 23660 + }, + { + "epoch": 0.6767691208005718, + "grad_norm": 0.5333718657493591, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0332, + "step": 23670 + }, + { + "epoch": 0.6770550393137955, + "grad_norm": 0.7278451323509216, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0409, + "step": 23680 + }, + { + "epoch": 0.6773409578270193, + "grad_norm": 0.41567277908325195, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0263, + "step": 23690 + }, + { + "epoch": 0.677626876340243, + "grad_norm": 0.4351106584072113, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0276, + "step": 23700 + }, + { + "epoch": 0.6779127948534668, + "grad_norm": 0.31096217036247253, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0339, + "step": 23710 + }, + { + "epoch": 0.6781987133666905, + "grad_norm": 0.6321837306022644, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0313, + "step": 23720 + }, + { + "epoch": 0.6784846318799143, + "grad_norm": 0.5278098583221436, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0294, + "step": 23730 + }, + { + "epoch": 0.6787705503931379, + "grad_norm": 0.5778757333755493, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0324, + "step": 23740 + }, + { + "epoch": 0.6790564689063617, + "grad_norm": 0.6164223551750183, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0316, + "step": 23750 + }, + { + "epoch": 0.6793423874195854, + "grad_norm": 0.2872319221496582, + "learning_rate": 3.979785400791052e-06, + "loss": 0.034, + "step": 23760 + }, + { + "epoch": 0.6796283059328091, + "grad_norm": 0.6088704466819763, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0317, + "step": 23770 + }, + { + "epoch": 0.6799142244460329, + "grad_norm": 0.4733040928840637, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0282, + "step": 23780 + }, + { + "epoch": 0.6802001429592566, + "grad_norm": 1.3417131900787354, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0304, + "step": 23790 + }, + { + "epoch": 0.6804860614724804, + "grad_norm": 0.7316146492958069, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0311, + "step": 23800 + }, + { + "epoch": 0.680771979985704, + "grad_norm": 0.5726248025894165, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0323, + "step": 23810 + }, + { + "epoch": 0.6810578984989278, + "grad_norm": 0.3990941345691681, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0277, + "step": 23820 + }, + { + "epoch": 0.6813438170121515, + "grad_norm": 0.49237731099128723, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0287, + "step": 23830 + }, + { + "epoch": 0.6816297355253753, + "grad_norm": 0.47560542821884155, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0298, + "step": 23840 + }, + { + "epoch": 0.681915654038599, + "grad_norm": 0.5967867374420166, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0295, + "step": 23850 + }, + { + "epoch": 0.6822015725518227, + "grad_norm": 0.5726722478866577, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0283, + "step": 23860 + }, + { + "epoch": 0.6824874910650465, + "grad_norm": 0.282678484916687, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0303, + "step": 23870 + }, + { + "epoch": 0.6827734095782702, + "grad_norm": 0.4432118237018585, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0296, + "step": 23880 + }, + { + "epoch": 0.683059328091494, + "grad_norm": 0.33677008748054504, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0379, + "step": 23890 + }, + { + "epoch": 0.6833452466047176, + "grad_norm": 0.5063587427139282, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0281, + "step": 23900 + }, + { + "epoch": 0.6836311651179414, + "grad_norm": 0.2592383921146393, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0263, + "step": 23910 + }, + { + "epoch": 0.6839170836311651, + "grad_norm": 0.4482796788215637, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0289, + "step": 23920 + }, + { + "epoch": 0.6842030021443889, + "grad_norm": 0.2609167993068695, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0294, + "step": 23930 + }, + { + "epoch": 0.6844889206576126, + "grad_norm": 0.36982619762420654, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0306, + "step": 23940 + }, + { + "epoch": 0.6847748391708363, + "grad_norm": 0.47758495807647705, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0273, + "step": 23950 + }, + { + "epoch": 0.68506075768406, + "grad_norm": 0.5566948652267456, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0318, + "step": 23960 + }, + { + "epoch": 0.6853466761972837, + "grad_norm": 0.7815461754798889, + "learning_rate": 3.853493736024934e-06, + "loss": 0.03, + "step": 23970 + }, + { + "epoch": 0.6856325947105075, + "grad_norm": 0.42888402938842773, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0384, + "step": 23980 + }, + { + "epoch": 0.6859185132237312, + "grad_norm": 0.47878748178482056, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0356, + "step": 23990 + }, + { + "epoch": 0.686204431736955, + "grad_norm": 0.3847522735595703, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0272, + "step": 24000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.499133370171392e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/training_args.bin b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a8e9db2fc8c02e02c3d9dc8ab6720ad303a5b3a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612ba70c7690571cb25b3741b149289d0da6675f330268700d4dd75e92ecc19a +size 6097 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/generation_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..318a917053a2f3911c0745144c2ffe9ae051202a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a529edd71f55b14d491100e168161c7e6fd116da52fe82191eb3eece70b303fa +size 4921072616 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0ee2b104aa4c7a1d09d4d3f05c297cb68785e43c --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1605578d38479da5748cdf3f05c48a0482444ebbbc6b852782d33dc63f43b74 +size 4978830984 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..96c526ab5b145c5950a79ceda8fb32221936fb8a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c4cae7294526df814ae6d9da83393c70606e096b92d80b2c89bbb7a0dc13284 +size 4100977896 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..7a37358d95e92a337ffbc69008e6d3a514583ff2 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -15.553912042236327, + -29.199742523193358, + -19.58108451538086, + -2.290254103851318, + -3.98537020587921, + -3.326780859374999, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 20.256868560791013, + 29.94644501495361, + 21.81786548461914, + 2.931905368041992, + 5.064435471534729, + 3.8213318216323877, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.8829866647720337, + 2.0021812915802, + 0.2094610631465912, + 0.0940750315785408, + 0.0910087525844574, + 0.012966467998921871, + -0.09716881066560745, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.976093769073486, + 10.930583953857422, + 8.330232620239258, + 0.8605863451957703, + 1.5304595232009888, + 1.1747541427612305, + 0.995267927646637, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -14.624815139007566, + -31.510755078125, + -35.281760287475585, + -4.413841687011719, + -8.509904860687255, + -6.548201916885375, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 40.4127169593811, + 31.91034956970215, + 26.84413584289551, + 7.540738459014893, + 10.178268561553956, + 9.913993389892582, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 10.31286334991455, + 3.0421667098999023, + -4.947638511657715, + 0.41632387042045593, + -0.9987452030181885, + -0.18793217837810516, + -0.08814626932144165, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 10.463665962219238, + 14.231209754943848, + 11.03242301940918, + 2.1795010566711426, + 3.3540749549865723, + 2.708117961883545, + 0.9961075186729431, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a6c028da160f73a63d855edc7f42f451d9ab1916 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json @@ -0,0 +1,18234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7433881343817013, + "eval_steps": 500, + "global_step": 26000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002859185132237312, + "grad_norm": 4.32843542098999, + "learning_rate": 1.8e-07, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0005718370264474624, + "grad_norm": 5.184113502502441, + "learning_rate": 3.8e-07, + "loss": 0.6206, + "step": 20 + }, + { + "epoch": 0.0008577555396711937, + "grad_norm": 4.515527248382568, + "learning_rate": 5.800000000000001e-07, + "loss": 0.582, + "step": 30 + }, + { + "epoch": 0.0011436740528949249, + "grad_norm": 2.8382818698883057, + "learning_rate": 7.8e-07, + "loss": 0.544, + "step": 40 + }, + { + "epoch": 0.0014295925661186562, + "grad_norm": 4.019079208374023, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6381, + "step": 50 + }, + { + "epoch": 0.0017155110793423873, + "grad_norm": 2.9916157722473145, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5463, + "step": 60 + }, + { + "epoch": 0.0020014295925661185, + "grad_norm": 3.3288328647613525, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.446, + "step": 70 + }, + { + "epoch": 0.0022873481057898498, + "grad_norm": 3.181410312652588, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4497, + "step": 80 + }, + { + "epoch": 0.002573266619013581, + "grad_norm": 1.421942949295044, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.349, + "step": 90 + }, + { + "epoch": 0.0028591851322373124, + "grad_norm": 1.908596396446228, + "learning_rate": 1.98e-06, + "loss": 0.3338, + "step": 100 + }, + { + "epoch": 0.0031451036454610438, + "grad_norm": 1.8309729099273682, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.0034310221586847747, + "grad_norm": 3.051408290863037, + "learning_rate": 2.38e-06, + "loss": 0.2418, + "step": 120 + }, + { + "epoch": 0.003716940671908506, + "grad_norm": 2.4083356857299805, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1726, + "step": 130 + }, + { + "epoch": 0.004002859185132237, + "grad_norm": 1.111687421798706, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.2164, + "step": 140 + }, + { + "epoch": 0.004288777698355968, + "grad_norm": 1.3874679803848267, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1312, + "step": 150 + }, + { + "epoch": 0.0045746962115796996, + "grad_norm": 1.2791540622711182, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1198, + "step": 160 + }, + { + "epoch": 0.004860614724803431, + "grad_norm": 1.6237181425094604, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1027, + "step": 170 + }, + { + "epoch": 0.005146533238027162, + "grad_norm": 0.9669432640075684, + "learning_rate": 3.58e-06, + "loss": 0.0968, + "step": 180 + }, + { + "epoch": 0.0054324517512508936, + "grad_norm": 1.4933182001113892, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.1012, + "step": 190 + }, + { + "epoch": 0.005718370264474625, + "grad_norm": 1.8615745306015015, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0901, + "step": 200 + }, + { + "epoch": 0.006004288777698356, + "grad_norm": 1.867163062095642, + "learning_rate": 4.18e-06, + "loss": 0.1067, + "step": 210 + }, + { + "epoch": 0.0062902072909220876, + "grad_norm": 1.199497103691101, + "learning_rate": 4.38e-06, + "loss": 0.0841, + "step": 220 + }, + { + "epoch": 0.006576125804145818, + "grad_norm": 1.1568272113800049, + "learning_rate": 4.58e-06, + "loss": 0.0951, + "step": 230 + }, + { + "epoch": 0.006862044317369549, + "grad_norm": 2.139226198196411, + "learning_rate": 4.78e-06, + "loss": 0.0845, + "step": 240 + }, + { + "epoch": 0.007147962830593281, + "grad_norm": 1.0357667207717896, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0828, + "step": 250 + }, + { + "epoch": 0.007433881343817012, + "grad_norm": 1.0145683288574219, + "learning_rate": 5.18e-06, + "loss": 0.0925, + "step": 260 + }, + { + "epoch": 0.007719799857040743, + "grad_norm": 1.308053731918335, + "learning_rate": 5.380000000000001e-06, + "loss": 0.082, + "step": 270 + }, + { + "epoch": 0.008005718370264474, + "grad_norm": 1.1561739444732666, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0888, + "step": 280 + }, + { + "epoch": 0.008291636883488206, + "grad_norm": 0.8777005672454834, + "learning_rate": 5.78e-06, + "loss": 0.0693, + "step": 290 + }, + { + "epoch": 0.008577555396711936, + "grad_norm": 0.9127368330955505, + "learning_rate": 5.98e-06, + "loss": 0.0823, + "step": 300 + }, + { + "epoch": 0.008863473909935669, + "grad_norm": 0.5608117580413818, + "learning_rate": 6.18e-06, + "loss": 0.0733, + "step": 310 + }, + { + "epoch": 0.009149392423159399, + "grad_norm": 1.9068444967269897, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0772, + "step": 320 + }, + { + "epoch": 0.009435310936383131, + "grad_norm": 0.9090886116027832, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.062, + "step": 330 + }, + { + "epoch": 0.009721229449606862, + "grad_norm": 1.191778540611267, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0718, + "step": 340 + }, + { + "epoch": 0.010007147962830594, + "grad_norm": 1.3743036985397339, + "learning_rate": 6.98e-06, + "loss": 0.0822, + "step": 350 + }, + { + "epoch": 0.010293066476054324, + "grad_norm": 1.4244364500045776, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0793, + "step": 360 + }, + { + "epoch": 0.010578984989278055, + "grad_norm": 1.1766910552978516, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0637, + "step": 370 + }, + { + "epoch": 0.010864903502501787, + "grad_norm": 1.1331329345703125, + "learning_rate": 7.58e-06, + "loss": 0.0705, + "step": 380 + }, + { + "epoch": 0.011150822015725518, + "grad_norm": 0.4898548424243927, + "learning_rate": 7.78e-06, + "loss": 0.0686, + "step": 390 + }, + { + "epoch": 0.01143674052894925, + "grad_norm": 0.7398406267166138, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0719, + "step": 400 + }, + { + "epoch": 0.01172265904217298, + "grad_norm": 1.1516162157058716, + "learning_rate": 8.18e-06, + "loss": 0.0696, + "step": 410 + }, + { + "epoch": 0.012008577555396712, + "grad_norm": 1.6034163236618042, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0698, + "step": 420 + }, + { + "epoch": 0.012294496068620443, + "grad_norm": 1.2195311784744263, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.012580414581844175, + "grad_norm": 1.1106441020965576, + "learning_rate": 8.78e-06, + "loss": 0.0749, + "step": 440 + }, + { + "epoch": 0.012866333095067906, + "grad_norm": 1.1787506341934204, + "learning_rate": 8.98e-06, + "loss": 0.0718, + "step": 450 + }, + { + "epoch": 0.013152251608291636, + "grad_norm": 0.4380492568016052, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0692, + "step": 460 + }, + { + "epoch": 0.013438170121515368, + "grad_norm": 1.0138392448425293, + "learning_rate": 9.38e-06, + "loss": 0.0718, + "step": 470 + }, + { + "epoch": 0.013724088634739099, + "grad_norm": 0.50003582239151, + "learning_rate": 9.58e-06, + "loss": 0.078, + "step": 480 + }, + { + "epoch": 0.014010007147962831, + "grad_norm": 0.6253323554992676, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0762, + "step": 490 + }, + { + "epoch": 0.014295925661186561, + "grad_norm": 0.6725791096687317, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0615, + "step": 500 + }, + { + "epoch": 0.014581844174410294, + "grad_norm": 0.6100206971168518, + "learning_rate": 1.018e-05, + "loss": 0.0576, + "step": 510 + }, + { + "epoch": 0.014867762687634024, + "grad_norm": 1.9225071668624878, + "learning_rate": 1.038e-05, + "loss": 0.0957, + "step": 520 + }, + { + "epoch": 0.015153681200857756, + "grad_norm": 1.304625391960144, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.015439599714081487, + "grad_norm": 0.7657200694084167, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0612, + "step": 540 + }, + { + "epoch": 0.015725518227305217, + "grad_norm": 0.7371220588684082, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0719, + "step": 550 + }, + { + "epoch": 0.016011436740528948, + "grad_norm": 0.7274985313415527, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0594, + "step": 560 + }, + { + "epoch": 0.01629735525375268, + "grad_norm": 1.3222947120666504, + "learning_rate": 1.138e-05, + "loss": 0.0655, + "step": 570 + }, + { + "epoch": 0.016583273766976412, + "grad_norm": 0.965411901473999, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0638, + "step": 580 + }, + { + "epoch": 0.016869192280200142, + "grad_norm": 0.8161532878875732, + "learning_rate": 1.178e-05, + "loss": 0.0532, + "step": 590 + }, + { + "epoch": 0.017155110793423873, + "grad_norm": 0.8228808045387268, + "learning_rate": 1.198e-05, + "loss": 0.051, + "step": 600 + }, + { + "epoch": 0.017441029306647607, + "grad_norm": 0.6932743191719055, + "learning_rate": 1.218e-05, + "loss": 0.0595, + "step": 610 + }, + { + "epoch": 0.017726947819871337, + "grad_norm": 0.6848511099815369, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0589, + "step": 620 + }, + { + "epoch": 0.018012866333095068, + "grad_norm": 1.137454867362976, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0694, + "step": 630 + }, + { + "epoch": 0.018298784846318798, + "grad_norm": 0.8087878227233887, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0738, + "step": 640 + }, + { + "epoch": 0.01858470335954253, + "grad_norm": 0.8093737363815308, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.058, + "step": 650 + }, + { + "epoch": 0.018870621872766263, + "grad_norm": 0.8387401700019836, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0686, + "step": 660 + }, + { + "epoch": 0.019156540385989993, + "grad_norm": 1.1544110774993896, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0592, + "step": 670 + }, + { + "epoch": 0.019442458899213724, + "grad_norm": 0.8208314180374146, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.019728377412437454, + "grad_norm": 0.97088623046875, + "learning_rate": 1.378e-05, + "loss": 0.0675, + "step": 690 + }, + { + "epoch": 0.020014295925661188, + "grad_norm": 1.0991814136505127, + "learning_rate": 1.398e-05, + "loss": 0.0745, + "step": 700 + }, + { + "epoch": 0.02030021443888492, + "grad_norm": 0.9467299580574036, + "learning_rate": 1.418e-05, + "loss": 0.0645, + "step": 710 + }, + { + "epoch": 0.02058613295210865, + "grad_norm": 0.4910801351070404, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0466, + "step": 720 + }, + { + "epoch": 0.02087205146533238, + "grad_norm": 1.0102845430374146, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0735, + "step": 730 + }, + { + "epoch": 0.02115796997855611, + "grad_norm": 0.9033467769622803, + "learning_rate": 1.478e-05, + "loss": 0.0741, + "step": 740 + }, + { + "epoch": 0.021443888491779844, + "grad_norm": 1.6092171669006348, + "learning_rate": 1.498e-05, + "loss": 0.0737, + "step": 750 + }, + { + "epoch": 0.021729807005003574, + "grad_norm": 0.7047333717346191, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0604, + "step": 760 + }, + { + "epoch": 0.022015725518227305, + "grad_norm": 1.2015491724014282, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0646, + "step": 770 + }, + { + "epoch": 0.022301644031451035, + "grad_norm": 1.1669623851776123, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0587, + "step": 780 + }, + { + "epoch": 0.02258756254467477, + "grad_norm": 1.137113094329834, + "learning_rate": 1.578e-05, + "loss": 0.0692, + "step": 790 + }, + { + "epoch": 0.0228734810578985, + "grad_norm": 1.269505262374878, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0711, + "step": 800 + }, + { + "epoch": 0.02315939957112223, + "grad_norm": 0.942534863948822, + "learning_rate": 1.618e-05, + "loss": 0.0782, + "step": 810 + }, + { + "epoch": 0.02344531808434596, + "grad_norm": 0.9548556208610535, + "learning_rate": 1.638e-05, + "loss": 0.0814, + "step": 820 + }, + { + "epoch": 0.02373123659756969, + "grad_norm": 1.0210421085357666, + "learning_rate": 1.658e-05, + "loss": 0.0774, + "step": 830 + }, + { + "epoch": 0.024017155110793425, + "grad_norm": 1.0955135822296143, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0693, + "step": 840 + }, + { + "epoch": 0.024303073624017155, + "grad_norm": 1.2081682682037354, + "learning_rate": 1.698e-05, + "loss": 0.0589, + "step": 850 + }, + { + "epoch": 0.024588992137240886, + "grad_norm": 0.9728164076805115, + "learning_rate": 1.718e-05, + "loss": 0.0585, + "step": 860 + }, + { + "epoch": 0.024874910650464616, + "grad_norm": 1.310244083404541, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.066, + "step": 870 + }, + { + "epoch": 0.02516082916368835, + "grad_norm": 0.8860681653022766, + "learning_rate": 1.758e-05, + "loss": 0.0703, + "step": 880 + }, + { + "epoch": 0.02544674767691208, + "grad_norm": 2.1878466606140137, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0913, + "step": 890 + }, + { + "epoch": 0.02573266619013581, + "grad_norm": 0.6659205555915833, + "learning_rate": 1.798e-05, + "loss": 0.0603, + "step": 900 + }, + { + "epoch": 0.02601858470335954, + "grad_norm": 0.6700656414031982, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.074, + "step": 910 + }, + { + "epoch": 0.026304503216583272, + "grad_norm": 0.8292778134346008, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0529, + "step": 920 + }, + { + "epoch": 0.026590421729807006, + "grad_norm": 0.9637550115585327, + "learning_rate": 1.858e-05, + "loss": 0.0604, + "step": 930 + }, + { + "epoch": 0.026876340243030736, + "grad_norm": 0.4605652689933777, + "learning_rate": 1.878e-05, + "loss": 0.0657, + "step": 940 + }, + { + "epoch": 0.027162258756254467, + "grad_norm": 1.3346972465515137, + "learning_rate": 1.898e-05, + "loss": 0.0576, + "step": 950 + }, + { + "epoch": 0.027448177269478197, + "grad_norm": 0.8369432091712952, + "learning_rate": 1.918e-05, + "loss": 0.0567, + "step": 960 + }, + { + "epoch": 0.02773409578270193, + "grad_norm": 0.613459050655365, + "learning_rate": 1.938e-05, + "loss": 0.0523, + "step": 970 + }, + { + "epoch": 0.028020014295925662, + "grad_norm": 1.402799367904663, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0794, + "step": 980 + }, + { + "epoch": 0.028305932809149392, + "grad_norm": 1.1603201627731323, + "learning_rate": 1.978e-05, + "loss": 0.0583, + "step": 990 + }, + { + "epoch": 0.028591851322373123, + "grad_norm": 0.8101517558097839, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0584, + "step": 1000 + }, + { + "epoch": 0.028877769835596853, + "grad_norm": 1.060592770576477, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.077, + "step": 1010 + }, + { + "epoch": 0.029163688348820587, + "grad_norm": 1.2096195220947266, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.066, + "step": 1020 + }, + { + "epoch": 0.029449606862044318, + "grad_norm": 1.0035862922668457, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0625, + "step": 1030 + }, + { + "epoch": 0.029735525375268048, + "grad_norm": 0.44185084104537964, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.03002144388849178, + "grad_norm": 1.209908127784729, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0693, + "step": 1050 + }, + { + "epoch": 0.030307362401715512, + "grad_norm": 0.9716938138008118, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0706, + "step": 1060 + }, + { + "epoch": 0.030593280914939243, + "grad_norm": 0.8310994505882263, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0622, + "step": 1070 + }, + { + "epoch": 0.030879199428162973, + "grad_norm": 0.8737888932228088, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0564, + "step": 1080 + }, + { + "epoch": 0.031165117941386704, + "grad_norm": 0.7609763145446777, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0552, + "step": 1090 + }, + { + "epoch": 0.031451036454610434, + "grad_norm": 0.6319764256477356, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0593, + "step": 1100 + }, + { + "epoch": 0.031736954967834165, + "grad_norm": 0.5562251806259155, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0553, + "step": 1110 + }, + { + "epoch": 0.032022873481057895, + "grad_norm": 1.3476046323776245, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0805, + "step": 1120 + }, + { + "epoch": 0.03230879199428163, + "grad_norm": 0.5449394583702087, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0666, + "step": 1130 + }, + { + "epoch": 0.03259471050750536, + "grad_norm": 0.8675817251205444, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0703, + "step": 1140 + }, + { + "epoch": 0.032880629020729094, + "grad_norm": 0.8713150024414062, + "learning_rate": 1.999882759038658e-05, + "loss": 0.063, + "step": 1150 + }, + { + "epoch": 0.033166547533952824, + "grad_norm": 0.7205761075019836, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0597, + "step": 1160 + }, + { + "epoch": 0.033452466047176554, + "grad_norm": 0.482741117477417, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0558, + "step": 1170 + }, + { + "epoch": 0.033738384560400285, + "grad_norm": 0.8652167320251465, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0559, + "step": 1180 + }, + { + "epoch": 0.034024303073624015, + "grad_norm": 0.5286755561828613, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0673, + "step": 1190 + }, + { + "epoch": 0.034310221586847746, + "grad_norm": 0.9883217215538025, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0609, + "step": 1200 + }, + { + "epoch": 0.034596140100071476, + "grad_norm": 0.7700253129005432, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0703, + "step": 1210 + }, + { + "epoch": 0.034882058613295214, + "grad_norm": 0.8669867515563965, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0577, + "step": 1220 + }, + { + "epoch": 0.035167977126518944, + "grad_norm": 0.8856104016304016, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0599, + "step": 1230 + }, + { + "epoch": 0.035453895639742675, + "grad_norm": 0.5517004728317261, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0791, + "step": 1240 + }, + { + "epoch": 0.035739814152966405, + "grad_norm": 0.7505853176116943, + "learning_rate": 1.999672592499692e-05, + "loss": 0.086, + "step": 1250 + }, + { + "epoch": 0.036025732666190136, + "grad_norm": 0.7412230968475342, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0534, + "step": 1260 + }, + { + "epoch": 0.036311651179413866, + "grad_norm": 0.6629419922828674, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0607, + "step": 1270 + }, + { + "epoch": 0.036597569692637597, + "grad_norm": 0.7081887125968933, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0513, + "step": 1280 + }, + { + "epoch": 0.03688348820586133, + "grad_norm": 0.8555129766464233, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0574, + "step": 1290 + }, + { + "epoch": 0.03716940671908506, + "grad_norm": 0.5992563366889954, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0529, + "step": 1300 + }, + { + "epoch": 0.037455325232308795, + "grad_norm": 0.8527185320854187, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0588, + "step": 1310 + }, + { + "epoch": 0.037741243745532525, + "grad_norm": 1.078600525856018, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0644, + "step": 1320 + }, + { + "epoch": 0.038027162258756256, + "grad_norm": 0.8158502578735352, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0587, + "step": 1330 + }, + { + "epoch": 0.038313080771979986, + "grad_norm": 1.011278748512268, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0696, + "step": 1340 + }, + { + "epoch": 0.03859899928520372, + "grad_norm": 0.806888222694397, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0588, + "step": 1350 + }, + { + "epoch": 0.03888491779842745, + "grad_norm": 0.7776031494140625, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0461, + "step": 1360 + }, + { + "epoch": 0.03917083631165118, + "grad_norm": 0.6119349598884583, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0566, + "step": 1370 + }, + { + "epoch": 0.03945675482487491, + "grad_norm": 0.6168059706687927, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0636, + "step": 1380 + }, + { + "epoch": 0.03974267333809864, + "grad_norm": 0.8180692195892334, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0596, + "step": 1390 + }, + { + "epoch": 0.040028591851322376, + "grad_norm": 0.6775726079940796, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0586, + "step": 1400 + }, + { + "epoch": 0.040314510364546106, + "grad_norm": 0.7446377873420715, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.057, + "step": 1410 + }, + { + "epoch": 0.04060042887776984, + "grad_norm": 0.9334514737129211, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0551, + "step": 1420 + }, + { + "epoch": 0.04088634739099357, + "grad_norm": 1.481874942779541, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0664, + "step": 1430 + }, + { + "epoch": 0.0411722659042173, + "grad_norm": 0.9553850889205933, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0645, + "step": 1440 + }, + { + "epoch": 0.04145818441744103, + "grad_norm": 0.8824119567871094, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0694, + "step": 1450 + }, + { + "epoch": 0.04174410293066476, + "grad_norm": 1.0382661819458008, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0683, + "step": 1460 + }, + { + "epoch": 0.04203002144388849, + "grad_norm": 0.5914127826690674, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0652, + "step": 1470 + }, + { + "epoch": 0.04231593995711222, + "grad_norm": 0.8497964143753052, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0706, + "step": 1480 + }, + { + "epoch": 0.04260185847033596, + "grad_norm": 0.897759199142456, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0647, + "step": 1490 + }, + { + "epoch": 0.04288777698355969, + "grad_norm": 1.1102443933486938, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0579, + "step": 1500 + }, + { + "epoch": 0.04317369549678342, + "grad_norm": 0.7638678550720215, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0606, + "step": 1510 + }, + { + "epoch": 0.04345961401000715, + "grad_norm": 0.6662708520889282, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.067, + "step": 1520 + }, + { + "epoch": 0.04374553252323088, + "grad_norm": 0.4957924485206604, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0685, + "step": 1530 + }, + { + "epoch": 0.04403145103645461, + "grad_norm": 0.6456794738769531, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0665, + "step": 1540 + }, + { + "epoch": 0.04431736954967834, + "grad_norm": 1.1598498821258545, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0527, + "step": 1550 + }, + { + "epoch": 0.04460328806290207, + "grad_norm": 0.931520938873291, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0682, + "step": 1560 + }, + { + "epoch": 0.0448892065761258, + "grad_norm": 0.7289925813674927, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0726, + "step": 1570 + }, + { + "epoch": 0.04517512508934954, + "grad_norm": 0.5471235513687134, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0561, + "step": 1580 + }, + { + "epoch": 0.04546104360257327, + "grad_norm": 0.8686550259590149, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0552, + "step": 1590 + }, + { + "epoch": 0.045746962115797, + "grad_norm": 1.1767120361328125, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0544, + "step": 1600 + }, + { + "epoch": 0.04603288062902073, + "grad_norm": 0.8729729056358337, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0625, + "step": 1610 + }, + { + "epoch": 0.04631879914224446, + "grad_norm": 1.3734601736068726, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0667, + "step": 1620 + }, + { + "epoch": 0.04660471765546819, + "grad_norm": 0.6810682415962219, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0522, + "step": 1630 + }, + { + "epoch": 0.04689063616869192, + "grad_norm": 0.7744873762130737, + "learning_rate": 1.997844517262844e-05, + "loss": 0.06, + "step": 1640 + }, + { + "epoch": 0.04717655468191565, + "grad_norm": 1.000954270362854, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0606, + "step": 1650 + }, + { + "epoch": 0.04746247319513938, + "grad_norm": 0.8105701208114624, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.04774839170836312, + "grad_norm": 0.9504240155220032, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0833, + "step": 1670 + }, + { + "epoch": 0.04803431022158685, + "grad_norm": 0.910836935043335, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0582, + "step": 1680 + }, + { + "epoch": 0.04832022873481058, + "grad_norm": 0.5865645408630371, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0657, + "step": 1690 + }, + { + "epoch": 0.04860614724803431, + "grad_norm": 1.0098698139190674, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 0.04889206576125804, + "grad_norm": 0.8097764253616333, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0563, + "step": 1710 + }, + { + "epoch": 0.04917798427448177, + "grad_norm": 0.9958128333091736, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0597, + "step": 1720 + }, + { + "epoch": 0.0494639027877055, + "grad_norm": 0.8471905589103699, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.04974982130092923, + "grad_norm": 0.647058367729187, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 0.05003573981415296, + "grad_norm": 1.0832161903381348, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.052, + "step": 1750 + }, + { + "epoch": 0.0503216583273767, + "grad_norm": 0.8469381332397461, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0615, + "step": 1760 + }, + { + "epoch": 0.05060757684060043, + "grad_norm": 0.5371052622795105, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0556, + "step": 1770 + }, + { + "epoch": 0.05089349535382416, + "grad_norm": 0.9016183614730835, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0561, + "step": 1780 + }, + { + "epoch": 0.05117941386704789, + "grad_norm": 0.8829526305198669, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0648, + "step": 1790 + }, + { + "epoch": 0.05146533238027162, + "grad_norm": 1.079738974571228, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0577, + "step": 1800 + }, + { + "epoch": 0.05175125089349535, + "grad_norm": 0.7496556639671326, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.052, + "step": 1810 + }, + { + "epoch": 0.05203716940671908, + "grad_norm": 0.7587016820907593, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0634, + "step": 1820 + }, + { + "epoch": 0.052323087919942814, + "grad_norm": 0.9622246623039246, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0543, + "step": 1830 + }, + { + "epoch": 0.052609006433166544, + "grad_norm": 0.6643623113632202, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0606, + "step": 1840 + }, + { + "epoch": 0.05289492494639028, + "grad_norm": 0.8060843348503113, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0562, + "step": 1850 + }, + { + "epoch": 0.05318084345961401, + "grad_norm": 0.7353034019470215, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0679, + "step": 1860 + }, + { + "epoch": 0.05346676197283774, + "grad_norm": 0.6636782288551331, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0561, + "step": 1870 + }, + { + "epoch": 0.05375268048606147, + "grad_norm": 0.6760010719299316, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0674, + "step": 1880 + }, + { + "epoch": 0.0540385989992852, + "grad_norm": 0.7144591808319092, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0551, + "step": 1890 + }, + { + "epoch": 0.054324517512508934, + "grad_norm": 0.8346575498580933, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.049, + "step": 1900 + }, + { + "epoch": 0.054610436025732664, + "grad_norm": 1.1682871580123901, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0554, + "step": 1910 + }, + { + "epoch": 0.054896354538956395, + "grad_norm": 0.9150840640068054, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0549, + "step": 1920 + }, + { + "epoch": 0.055182273052180125, + "grad_norm": 0.37064746022224426, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0547, + "step": 1930 + }, + { + "epoch": 0.05546819156540386, + "grad_norm": 1.1214783191680908, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0697, + "step": 1940 + }, + { + "epoch": 0.05575411007862759, + "grad_norm": 0.8259853720664978, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0614, + "step": 1950 + }, + { + "epoch": 0.056040028591851324, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0811, + "step": 1960 + }, + { + "epoch": 0.056325947105075054, + "grad_norm": 0.8764797449111938, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0605, + "step": 1970 + }, + { + "epoch": 0.056611865618298784, + "grad_norm": 0.770044207572937, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0481, + "step": 1980 + }, + { + "epoch": 0.056897784131522515, + "grad_norm": 1.333876132965088, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0729, + "step": 1990 + }, + { + "epoch": 0.057183702644746245, + "grad_norm": 0.5231258273124695, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.051, + "step": 2000 + }, + { + "epoch": 0.057469621157969976, + "grad_norm": 1.1937541961669922, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.061, + "step": 2010 + }, + { + "epoch": 0.057755539671193706, + "grad_norm": 0.7843487858772278, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0688, + "step": 2020 + }, + { + "epoch": 0.058041458184417444, + "grad_norm": 0.7956593632698059, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0878, + "step": 2030 + }, + { + "epoch": 0.058327376697641174, + "grad_norm": 0.5006444454193115, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0588, + "step": 2040 + }, + { + "epoch": 0.058613295210864905, + "grad_norm": 1.162245750427246, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0619, + "step": 2050 + }, + { + "epoch": 0.058899213724088635, + "grad_norm": 0.46943384408950806, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0584, + "step": 2060 + }, + { + "epoch": 0.059185132237312366, + "grad_norm": 0.3780323266983032, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0462, + "step": 2070 + }, + { + "epoch": 0.059471050750536096, + "grad_norm": 0.7066171765327454, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0652, + "step": 2080 + }, + { + "epoch": 0.05975696926375983, + "grad_norm": 0.8464685082435608, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0546, + "step": 2090 + }, + { + "epoch": 0.06004288777698356, + "grad_norm": 0.7198944687843323, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0534, + "step": 2100 + }, + { + "epoch": 0.06032880629020729, + "grad_norm": 0.7136557698249817, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0665, + "step": 2110 + }, + { + "epoch": 0.060614724803431025, + "grad_norm": 0.8739225268363953, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0542, + "step": 2120 + }, + { + "epoch": 0.060900643316654755, + "grad_norm": 0.6694063544273376, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0575, + "step": 2130 + }, + { + "epoch": 0.061186561829878486, + "grad_norm": 0.4805296063423157, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0578, + "step": 2140 + }, + { + "epoch": 0.061472480343102216, + "grad_norm": 0.758660078048706, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0455, + "step": 2150 + }, + { + "epoch": 0.06175839885632595, + "grad_norm": 0.8114968538284302, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0801, + "step": 2160 + }, + { + "epoch": 0.06204431736954968, + "grad_norm": 0.6585670113563538, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0564, + "step": 2170 + }, + { + "epoch": 0.06233023588277341, + "grad_norm": 1.2986794710159302, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0595, + "step": 2180 + }, + { + "epoch": 0.06261615439599715, + "grad_norm": 0.9822471141815186, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0515, + "step": 2190 + }, + { + "epoch": 0.06290207290922087, + "grad_norm": 0.8112025260925293, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0585, + "step": 2200 + }, + { + "epoch": 0.0631879914224446, + "grad_norm": 0.6239551305770874, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0641, + "step": 2210 + }, + { + "epoch": 0.06347390993566833, + "grad_norm": 0.8405657410621643, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.057, + "step": 2220 + }, + { + "epoch": 0.06375982844889207, + "grad_norm": 0.4925670623779297, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0568, + "step": 2230 + }, + { + "epoch": 0.06404574696211579, + "grad_norm": 0.8599978089332581, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0587, + "step": 2240 + }, + { + "epoch": 0.06433166547533953, + "grad_norm": 0.8657258749008179, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.06461758398856327, + "grad_norm": 0.5826218128204346, + "learning_rate": 1.991642153373178e-05, + "loss": 0.055, + "step": 2260 + }, + { + "epoch": 0.06490350250178699, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0533, + "step": 2270 + }, + { + "epoch": 0.06518942101501073, + "grad_norm": 0.8345134258270264, + "learning_rate": 1.991374933341515e-05, + "loss": 0.064, + "step": 2280 + }, + { + "epoch": 0.06547533952823445, + "grad_norm": 0.6610177755355835, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0553, + "step": 2290 + }, + { + "epoch": 0.06576125804145819, + "grad_norm": 0.8541404604911804, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0483, + "step": 2300 + }, + { + "epoch": 0.06604717655468191, + "grad_norm": 0.9029123187065125, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0517, + "step": 2310 + }, + { + "epoch": 0.06633309506790565, + "grad_norm": 0.614111602306366, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.06661901358112937, + "grad_norm": 0.8723806142807007, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0511, + "step": 2330 + }, + { + "epoch": 0.06690493209435311, + "grad_norm": 0.5288586020469666, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0474, + "step": 2340 + }, + { + "epoch": 0.06719085060757685, + "grad_norm": 0.6346511840820312, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0532, + "step": 2350 + }, + { + "epoch": 0.06747676912080057, + "grad_norm": 0.9112687706947327, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0583, + "step": 2360 + }, + { + "epoch": 0.06776268763402431, + "grad_norm": 0.6879385113716125, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0551, + "step": 2370 + }, + { + "epoch": 0.06804860614724803, + "grad_norm": 0.6945562958717346, + "learning_rate": 1.989976094288735e-05, + "loss": 0.053, + "step": 2380 + }, + { + "epoch": 0.06833452466047177, + "grad_norm": 0.6774301528930664, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0596, + "step": 2390 + }, + { + "epoch": 0.06862044317369549, + "grad_norm": 0.7311446070671082, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0576, + "step": 2400 + }, + { + "epoch": 0.06890636168691923, + "grad_norm": 0.9301936030387878, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0608, + "step": 2410 + }, + { + "epoch": 0.06919228020014295, + "grad_norm": 1.1750341653823853, + "learning_rate": 1.989387305123247e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.06947819871336669, + "grad_norm": 0.716266930103302, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.053, + "step": 2430 + }, + { + "epoch": 0.06976411722659043, + "grad_norm": 0.8549973964691162, + "learning_rate": 1.989086647373215e-05, + "loss": 0.061, + "step": 2440 + }, + { + "epoch": 0.07005003573981415, + "grad_norm": 0.7306638360023499, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "epoch": 0.07033595425303789, + "grad_norm": 1.2529624700546265, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0597, + "step": 2460 + }, + { + "epoch": 0.07062187276626161, + "grad_norm": 0.7199717164039612, + "learning_rate": 1.988627835751598e-05, + "loss": 0.047, + "step": 2470 + }, + { + "epoch": 0.07090779127948535, + "grad_norm": 0.8007253408432007, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0544, + "step": 2480 + }, + { + "epoch": 0.07119370979270907, + "grad_norm": 0.7852535843849182, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0507, + "step": 2490 + }, + { + "epoch": 0.07147962830593281, + "grad_norm": 1.0649739503860474, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.058, + "step": 2500 + }, + { + "epoch": 0.07176554681915653, + "grad_norm": 0.8080071806907654, + "learning_rate": 1.988001487826387e-05, + "loss": 0.059, + "step": 2510 + }, + { + "epoch": 0.07205146533238027, + "grad_norm": 0.49453601241111755, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0522, + "step": 2520 + }, + { + "epoch": 0.07233738384560401, + "grad_norm": 0.7618975639343262, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.07262330235882773, + "grad_norm": 0.6284596920013428, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.058, + "step": 2540 + }, + { + "epoch": 0.07290922087205147, + "grad_norm": 1.6536812782287598, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0535, + "step": 2550 + }, + { + "epoch": 0.07319513938527519, + "grad_norm": 0.6516987681388855, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.061, + "step": 2560 + }, + { + "epoch": 0.07348105789849893, + "grad_norm": 0.7660441398620605, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0603, + "step": 2570 + }, + { + "epoch": 0.07376697641172265, + "grad_norm": 0.7900884747505188, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0494, + "step": 2580 + }, + { + "epoch": 0.07405289492494639, + "grad_norm": 0.9578459858894348, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0492, + "step": 2590 + }, + { + "epoch": 0.07433881343817011, + "grad_norm": 0.5268751978874207, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0524, + "step": 2600 + }, + { + "epoch": 0.07462473195139385, + "grad_norm": 0.8935990929603577, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0537, + "step": 2610 + }, + { + "epoch": 0.07491065046461759, + "grad_norm": 0.940441370010376, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0545, + "step": 2620 + }, + { + "epoch": 0.07519656897784131, + "grad_norm": 0.42767468094825745, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0508, + "step": 2630 + }, + { + "epoch": 0.07548248749106505, + "grad_norm": 0.6892207860946655, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0417, + "step": 2640 + }, + { + "epoch": 0.07576840600428877, + "grad_norm": 1.2622859477996826, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0665, + "step": 2650 + }, + { + "epoch": 0.07605432451751251, + "grad_norm": 0.8809115290641785, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0617, + "step": 2660 + }, + { + "epoch": 0.07634024303073624, + "grad_norm": 0.604371190071106, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0577, + "step": 2670 + }, + { + "epoch": 0.07662616154395997, + "grad_norm": 0.7091525793075562, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0552, + "step": 2680 + }, + { + "epoch": 0.0769120800571837, + "grad_norm": 0.7841326594352722, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0579, + "step": 2690 + }, + { + "epoch": 0.07719799857040743, + "grad_norm": 0.7789046764373779, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0511, + "step": 2700 + }, + { + "epoch": 0.07748391708363117, + "grad_norm": 0.6497660875320435, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0532, + "step": 2710 + }, + { + "epoch": 0.0777698355968549, + "grad_norm": 0.6902356147766113, + "learning_rate": 1.984439891859038e-05, + "loss": 0.06, + "step": 2720 + }, + { + "epoch": 0.07805575411007863, + "grad_norm": 0.5721703767776489, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0442, + "step": 2730 + }, + { + "epoch": 0.07834167262330236, + "grad_norm": 0.5205336809158325, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0551, + "step": 2740 + }, + { + "epoch": 0.07862759113652609, + "grad_norm": 1.0646073818206787, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0546, + "step": 2750 + }, + { + "epoch": 0.07891350964974982, + "grad_norm": 0.6809906363487244, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0603, + "step": 2760 + }, + { + "epoch": 0.07919942816297355, + "grad_norm": 0.7592756152153015, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0611, + "step": 2770 + }, + { + "epoch": 0.07948534667619728, + "grad_norm": 0.970733106136322, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.066, + "step": 2780 + }, + { + "epoch": 0.07977126518942101, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.983150881656814e-05, + "loss": 0.049, + "step": 2790 + }, + { + "epoch": 0.08005718370264475, + "grad_norm": 0.6761397123336792, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.048, + "step": 2800 + }, + { + "epoch": 0.08034310221586848, + "grad_norm": 0.9752228856086731, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.08062902072909221, + "grad_norm": 0.8727124929428101, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0629, + "step": 2820 + }, + { + "epoch": 0.08091493924231594, + "grad_norm": 0.8425240516662598, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0517, + "step": 2830 + }, + { + "epoch": 0.08120085775553967, + "grad_norm": 0.7011470198631287, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0484, + "step": 2840 + }, + { + "epoch": 0.0814867762687634, + "grad_norm": 0.836200475692749, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0504, + "step": 2850 + }, + { + "epoch": 0.08177269478198713, + "grad_norm": 0.4431964159011841, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0527, + "step": 2860 + }, + { + "epoch": 0.08205861329521086, + "grad_norm": 0.4666791260242462, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0556, + "step": 2870 + }, + { + "epoch": 0.0823445318084346, + "grad_norm": 0.5705346465110779, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0544, + "step": 2880 + }, + { + "epoch": 0.08263045032165833, + "grad_norm": 1.7237486839294434, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0555, + "step": 2890 + }, + { + "epoch": 0.08291636883488206, + "grad_norm": 0.9305147528648376, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.055, + "step": 2900 + }, + { + "epoch": 0.0832022873481058, + "grad_norm": 1.3475992679595947, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0723, + "step": 2910 + }, + { + "epoch": 0.08348820586132952, + "grad_norm": 0.7196787595748901, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0581, + "step": 2920 + }, + { + "epoch": 0.08377412437455325, + "grad_norm": 0.4567016363143921, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.08406004288777698, + "grad_norm": 0.8537796139717102, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0589, + "step": 2940 + }, + { + "epoch": 0.08434596140100072, + "grad_norm": 0.9526864886283875, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0581, + "step": 2950 + }, + { + "epoch": 0.08463187991422444, + "grad_norm": 0.8753517866134644, + "learning_rate": 1.979809151602651e-05, + "loss": 0.066, + "step": 2960 + }, + { + "epoch": 0.08491779842744818, + "grad_norm": 0.9062561988830566, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0472, + "step": 2970 + }, + { + "epoch": 0.08520371694067191, + "grad_norm": 1.0018329620361328, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0584, + "step": 2980 + }, + { + "epoch": 0.08548963545389564, + "grad_norm": 1.0577157735824585, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.05, + "step": 2990 + }, + { + "epoch": 0.08577555396711938, + "grad_norm": 1.0216799974441528, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0703, + "step": 3000 + }, + { + "epoch": 0.0860614724803431, + "grad_norm": 0.5581191778182983, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0682, + "step": 3010 + }, + { + "epoch": 0.08634739099356684, + "grad_norm": 0.6187682151794434, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 0.08663330950679056, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0565, + "step": 3030 + }, + { + "epoch": 0.0869192280200143, + "grad_norm": 0.8952509760856628, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0615, + "step": 3040 + }, + { + "epoch": 0.08720514653323802, + "grad_norm": 0.7387855648994446, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0434, + "step": 3050 + }, + { + "epoch": 0.08749106504646176, + "grad_norm": 0.8661363124847412, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0453, + "step": 3060 + }, + { + "epoch": 0.0877769835596855, + "grad_norm": 1.552089810371399, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0577, + "step": 3070 + }, + { + "epoch": 0.08806290207290922, + "grad_norm": 0.7555598616600037, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.058, + "step": 3080 + }, + { + "epoch": 0.08834882058613296, + "grad_norm": 0.7763100266456604, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.074, + "step": 3090 + }, + { + "epoch": 0.08863473909935668, + "grad_norm": 0.5088932514190674, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.07, + "step": 3100 + }, + { + "epoch": 0.08892065761258042, + "grad_norm": 0.517383873462677, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.08920657612580414, + "grad_norm": 0.9673930406570435, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.061, + "step": 3120 + }, + { + "epoch": 0.08949249463902788, + "grad_norm": 1.1182832717895508, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0503, + "step": 3130 + }, + { + "epoch": 0.0897784131522516, + "grad_norm": 0.8064592480659485, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0527, + "step": 3140 + }, + { + "epoch": 0.09006433166547534, + "grad_norm": 1.3616310358047485, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0491, + "step": 3150 + }, + { + "epoch": 0.09035025017869908, + "grad_norm": 0.6205968856811523, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0492, + "step": 3160 + }, + { + "epoch": 0.0906361686919228, + "grad_norm": 0.9427729249000549, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.054, + "step": 3170 + }, + { + "epoch": 0.09092208720514654, + "grad_norm": 0.6940050721168518, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0622, + "step": 3180 + }, + { + "epoch": 0.09120800571837026, + "grad_norm": 0.7082361578941345, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0474, + "step": 3190 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.4606474041938782, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 0.09177984274481772, + "grad_norm": 0.46445760130882263, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0483, + "step": 3210 + }, + { + "epoch": 0.09206576125804146, + "grad_norm": 0.7431371212005615, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.046, + "step": 3220 + }, + { + "epoch": 0.09235167977126518, + "grad_norm": 0.8430010676383972, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0665, + "step": 3230 + }, + { + "epoch": 0.09263759828448892, + "grad_norm": 0.9888875484466553, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0676, + "step": 3240 + }, + { + "epoch": 0.09292351679771266, + "grad_norm": 0.792150616645813, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0636, + "step": 3250 + }, + { + "epoch": 0.09320943531093638, + "grad_norm": 0.859030544757843, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0634, + "step": 3260 + }, + { + "epoch": 0.09349535382416012, + "grad_norm": 0.7612795233726501, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0658, + "step": 3270 + }, + { + "epoch": 0.09378127233738384, + "grad_norm": 0.5470104217529297, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0514, + "step": 3280 + }, + { + "epoch": 0.09406719085060758, + "grad_norm": 0.6354894042015076, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0489, + "step": 3290 + }, + { + "epoch": 0.0943531093638313, + "grad_norm": 1.3852356672286987, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 0.09463902787705504, + "grad_norm": 0.5610274076461792, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.09492494639027876, + "grad_norm": 1.2192410230636597, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0525, + "step": 3320 + }, + { + "epoch": 0.0952108649035025, + "grad_norm": 1.06831955909729, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.051, + "step": 3330 + }, + { + "epoch": 0.09549678341672624, + "grad_norm": 0.32288479804992676, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0503, + "step": 3340 + }, + { + "epoch": 0.09578270192994996, + "grad_norm": 0.5871645212173462, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0564, + "step": 3350 + }, + { + "epoch": 0.0960686204431737, + "grad_norm": 0.6069591045379639, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0495, + "step": 3360 + }, + { + "epoch": 0.09635453895639742, + "grad_norm": 1.0015379190444946, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0624, + "step": 3370 + }, + { + "epoch": 0.09664045746962116, + "grad_norm": 0.7534980773925781, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0618, + "step": 3380 + }, + { + "epoch": 0.09692637598284488, + "grad_norm": 0.45888280868530273, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0445, + "step": 3390 + }, + { + "epoch": 0.09721229449606862, + "grad_norm": 0.7550806403160095, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0461, + "step": 3400 + }, + { + "epoch": 0.09749821300929234, + "grad_norm": 0.4738181531429291, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 0.09778413152251608, + "grad_norm": 0.6711190938949585, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0476, + "step": 3420 + }, + { + "epoch": 0.09807005003573982, + "grad_norm": 0.4751316010951996, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0507, + "step": 3430 + }, + { + "epoch": 0.09835596854896354, + "grad_norm": 0.83565753698349, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "epoch": 0.09864188706218728, + "grad_norm": 0.5360665321350098, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0627, + "step": 3450 + }, + { + "epoch": 0.098927805575411, + "grad_norm": 0.7463604211807251, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0496, + "step": 3460 + }, + { + "epoch": 0.09921372408863474, + "grad_norm": 0.7294344305992126, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0697, + "step": 3470 + }, + { + "epoch": 0.09949964260185847, + "grad_norm": 0.5676283836364746, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0541, + "step": 3480 + }, + { + "epoch": 0.0997855611150822, + "grad_norm": 0.5879732370376587, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 0.10007147962830593, + "grad_norm": 0.832818865776062, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0505, + "step": 3500 + }, + { + "epoch": 0.10035739814152966, + "grad_norm": 0.48553410172462463, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0573, + "step": 3510 + }, + { + "epoch": 0.1006433166547534, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0449, + "step": 3520 + }, + { + "epoch": 0.10092923516797712, + "grad_norm": 0.7497885227203369, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0737, + "step": 3530 + }, + { + "epoch": 0.10121515368120086, + "grad_norm": 0.5581928491592407, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0514, + "step": 3540 + }, + { + "epoch": 0.10150107219442459, + "grad_norm": 1.140236258506775, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0524, + "step": 3550 + }, + { + "epoch": 0.10178699070764832, + "grad_norm": 0.8161870241165161, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0721, + "step": 3560 + }, + { + "epoch": 0.10207290922087205, + "grad_norm": 0.8796533942222595, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0564, + "step": 3570 + }, + { + "epoch": 0.10235882773409578, + "grad_norm": 1.4811128377914429, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.10264474624731951, + "grad_norm": 0.8029062747955322, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0589, + "step": 3590 + }, + { + "epoch": 0.10293066476054324, + "grad_norm": 0.7806634902954102, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0617, + "step": 3600 + }, + { + "epoch": 0.10321658327376698, + "grad_norm": 1.1286838054656982, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0574, + "step": 3610 + }, + { + "epoch": 0.1035025017869907, + "grad_norm": 0.374104768037796, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.047, + "step": 3620 + }, + { + "epoch": 0.10378842030021444, + "grad_norm": 1.1743136644363403, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0676, + "step": 3630 + }, + { + "epoch": 0.10407433881343817, + "grad_norm": 0.7684413194656372, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0663, + "step": 3640 + }, + { + "epoch": 0.1043602573266619, + "grad_norm": 1.0642409324645996, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.051, + "step": 3650 + }, + { + "epoch": 0.10464617583988563, + "grad_norm": 0.7752460837364197, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0543, + "step": 3660 + }, + { + "epoch": 0.10493209435310936, + "grad_norm": 0.9053257703781128, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.051, + "step": 3670 + }, + { + "epoch": 0.10521801286633309, + "grad_norm": 0.7407983541488647, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 0.10550393137955683, + "grad_norm": 1.3622519969940186, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0468, + "step": 3690 + }, + { + "epoch": 0.10578984989278056, + "grad_norm": 1.2751463651657104, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0571, + "step": 3700 + }, + { + "epoch": 0.10607576840600429, + "grad_norm": 0.5535411238670349, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0564, + "step": 3710 + }, + { + "epoch": 0.10636168691922802, + "grad_norm": 0.6728671193122864, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0487, + "step": 3720 + }, + { + "epoch": 0.10664760543245175, + "grad_norm": 0.82345050573349, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0656, + "step": 3730 + }, + { + "epoch": 0.10693352394567548, + "grad_norm": 0.6446594595909119, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0467, + "step": 3740 + }, + { + "epoch": 0.10721944245889921, + "grad_norm": 1.0836280584335327, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0536, + "step": 3750 + }, + { + "epoch": 0.10750536097212295, + "grad_norm": 0.3758300840854645, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0476, + "step": 3760 + }, + { + "epoch": 0.10779127948534667, + "grad_norm": 0.682266116142273, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0519, + "step": 3770 + }, + { + "epoch": 0.1080771979985704, + "grad_norm": 0.5025804042816162, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0474, + "step": 3780 + }, + { + "epoch": 0.10836311651179414, + "grad_norm": 1.019890308380127, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0492, + "step": 3790 + }, + { + "epoch": 0.10864903502501787, + "grad_norm": 0.7843710780143738, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0561, + "step": 3800 + }, + { + "epoch": 0.1089349535382416, + "grad_norm": 0.5028522610664368, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0582, + "step": 3810 + }, + { + "epoch": 0.10922087205146533, + "grad_norm": 0.6400144696235657, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0502, + "step": 3820 + }, + { + "epoch": 0.10950679056468907, + "grad_norm": 0.9432899355888367, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0471, + "step": 3830 + }, + { + "epoch": 0.10979270907791279, + "grad_norm": 0.7582482695579529, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.052, + "step": 3840 + }, + { + "epoch": 0.11007862759113653, + "grad_norm": 0.34035608172416687, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0464, + "step": 3850 + }, + { + "epoch": 0.11036454610436025, + "grad_norm": 1.3330878019332886, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.11065046461758399, + "grad_norm": 0.7309219837188721, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.052, + "step": 3870 + }, + { + "epoch": 0.11093638313080773, + "grad_norm": 0.6248922944068909, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0815, + "step": 3880 + }, + { + "epoch": 0.11122230164403145, + "grad_norm": 0.8298835158348083, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0491, + "step": 3890 + }, + { + "epoch": 0.11150822015725519, + "grad_norm": 0.6728928685188293, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0506, + "step": 3900 + }, + { + "epoch": 0.11179413867047891, + "grad_norm": 0.8456764817237854, + "learning_rate": 1.95567930185928e-05, + "loss": 0.051, + "step": 3910 + }, + { + "epoch": 0.11208005718370265, + "grad_norm": 0.9024212956428528, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0514, + "step": 3920 + }, + { + "epoch": 0.11236597569692637, + "grad_norm": 0.4843275845050812, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.056, + "step": 3930 + }, + { + "epoch": 0.11265189421015011, + "grad_norm": 0.5677530765533447, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0548, + "step": 3940 + }, + { + "epoch": 0.11293781272337383, + "grad_norm": 1.0913296937942505, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0697, + "step": 3950 + }, + { + "epoch": 0.11322373123659757, + "grad_norm": 0.6271129250526428, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0655, + "step": 3960 + }, + { + "epoch": 0.1135096497498213, + "grad_norm": 0.9063813090324402, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0469, + "step": 3970 + }, + { + "epoch": 0.11379556826304503, + "grad_norm": 0.7493836283683777, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0482, + "step": 3980 + }, + { + "epoch": 0.11408148677626877, + "grad_norm": 0.8022870421409607, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0655, + "step": 3990 + }, + { + "epoch": 0.11436740528949249, + "grad_norm": 0.6266750693321228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0542, + "step": 4000 + }, + { + "epoch": 0.11465332380271623, + "grad_norm": 0.45027732849121094, + "learning_rate": 1.95260726824789e-05, + "loss": 0.058, + "step": 4010 + }, + { + "epoch": 0.11493924231593995, + "grad_norm": 0.950760543346405, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0552, + "step": 4020 + }, + { + "epoch": 0.11522516082916369, + "grad_norm": 0.6397078037261963, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0544, + "step": 4030 + }, + { + "epoch": 0.11551107934238741, + "grad_norm": 0.7060579657554626, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0565, + "step": 4040 + }, + { + "epoch": 0.11579699785561115, + "grad_norm": 0.7861781716346741, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0511, + "step": 4050 + }, + { + "epoch": 0.11608291636883489, + "grad_norm": 0.5479229688644409, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0509, + "step": 4060 + }, + { + "epoch": 0.11636883488205861, + "grad_norm": 0.3854960501194, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0623, + "step": 4070 + }, + { + "epoch": 0.11665475339528235, + "grad_norm": 1.9533435106277466, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0643, + "step": 4080 + }, + { + "epoch": 0.11694067190850607, + "grad_norm": 0.5853668451309204, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0453, + "step": 4090 + }, + { + "epoch": 0.11722659042172981, + "grad_norm": 0.6850668787956238, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0555, + "step": 4100 + }, + { + "epoch": 0.11751250893495353, + "grad_norm": 1.1605839729309082, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0586, + "step": 4110 + }, + { + "epoch": 0.11779842744817727, + "grad_norm": 0.7753151059150696, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0623, + "step": 4120 + }, + { + "epoch": 0.118084345961401, + "grad_norm": 0.7955726385116577, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0467, + "step": 4130 + }, + { + "epoch": 0.11837026447462473, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0523, + "step": 4140 + }, + { + "epoch": 0.11865618298784847, + "grad_norm": 0.5821241140365601, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 0.11894210150107219, + "grad_norm": 0.4795539379119873, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0559, + "step": 4160 + }, + { + "epoch": 0.11922802001429593, + "grad_norm": 0.6324377655982971, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0473, + "step": 4170 + }, + { + "epoch": 0.11951393852751965, + "grad_norm": 0.8578745722770691, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0474, + "step": 4180 + }, + { + "epoch": 0.11979985704074339, + "grad_norm": 0.5988736748695374, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 0.12008577555396711, + "grad_norm": 0.8098701238632202, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0511, + "step": 4200 + }, + { + "epoch": 0.12037169406719085, + "grad_norm": 1.2059956789016724, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0501, + "step": 4210 + }, + { + "epoch": 0.12065761258041457, + "grad_norm": 0.7477571368217468, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0565, + "step": 4220 + }, + { + "epoch": 0.12094353109363831, + "grad_norm": 0.467942476272583, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0601, + "step": 4230 + }, + { + "epoch": 0.12122944960686205, + "grad_norm": 0.5761682391166687, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.052, + "step": 4240 + }, + { + "epoch": 0.12151536812008577, + "grad_norm": 0.8247032761573792, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0503, + "step": 4250 + }, + { + "epoch": 0.12180128663330951, + "grad_norm": 0.5218040347099304, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0434, + "step": 4260 + }, + { + "epoch": 0.12208720514653323, + "grad_norm": 0.5024936199188232, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 0.12237312365975697, + "grad_norm": 0.5558021664619446, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0493, + "step": 4280 + }, + { + "epoch": 0.1226590421729807, + "grad_norm": 0.6252139210700989, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0467, + "step": 4290 + }, + { + "epoch": 0.12294496068620443, + "grad_norm": 0.6613588929176331, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0572, + "step": 4300 + }, + { + "epoch": 0.12323087919942816, + "grad_norm": 0.8098927736282349, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0551, + "step": 4310 + }, + { + "epoch": 0.1235167977126519, + "grad_norm": 0.8598331809043884, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0517, + "step": 4320 + }, + { + "epoch": 0.12380271622587563, + "grad_norm": 1.2555822134017944, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0499, + "step": 4330 + }, + { + "epoch": 0.12408863473909935, + "grad_norm": 0.5311633348464966, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0467, + "step": 4340 + }, + { + "epoch": 0.12437455325232309, + "grad_norm": 0.5674521327018738, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0564, + "step": 4350 + }, + { + "epoch": 0.12466047176554682, + "grad_norm": 0.5226582884788513, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0503, + "step": 4360 + }, + { + "epoch": 0.12494639027877055, + "grad_norm": 0.8510275483131409, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0626, + "step": 4370 + }, + { + "epoch": 0.1252323087919943, + "grad_norm": 1.6184005737304688, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0565, + "step": 4380 + }, + { + "epoch": 0.125518227305218, + "grad_norm": 0.7836401462554932, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0567, + "step": 4390 + }, + { + "epoch": 0.12580414581844174, + "grad_norm": 0.686989963054657, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0727, + "step": 4400 + }, + { + "epoch": 0.12609006433166547, + "grad_norm": 0.6000984907150269, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0459, + "step": 4410 + }, + { + "epoch": 0.1263759828448892, + "grad_norm": 0.8751336932182312, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0509, + "step": 4420 + }, + { + "epoch": 0.12666190135811295, + "grad_norm": 0.9281551837921143, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0536, + "step": 4430 + }, + { + "epoch": 0.12694781987133666, + "grad_norm": 0.5268979668617249, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 0.1272337383845604, + "grad_norm": 0.9246962070465088, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0525, + "step": 4450 + }, + { + "epoch": 0.12751965689778413, + "grad_norm": 1.2159569263458252, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0559, + "step": 4460 + }, + { + "epoch": 0.12780557541100787, + "grad_norm": 1.1705470085144043, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0473, + "step": 4470 + }, + { + "epoch": 0.12809149392423158, + "grad_norm": 0.4624033570289612, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0385, + "step": 4480 + }, + { + "epoch": 0.12837741243745532, + "grad_norm": 0.68497633934021, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.055, + "step": 4490 + }, + { + "epoch": 0.12866333095067906, + "grad_norm": 0.6132450699806213, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0512, + "step": 4500 + }, + { + "epoch": 0.1289492494639028, + "grad_norm": 0.7438398003578186, + "learning_rate": 1.935753861926916e-05, + "loss": 0.057, + "step": 4510 + }, + { + "epoch": 0.12923516797712653, + "grad_norm": 1.01064133644104, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0542, + "step": 4520 + }, + { + "epoch": 0.12952108649035024, + "grad_norm": 0.7620115280151367, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0511, + "step": 4530 + }, + { + "epoch": 0.12980700500357398, + "grad_norm": 0.8325042128562927, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0434, + "step": 4540 + }, + { + "epoch": 0.13009292351679771, + "grad_norm": 1.333525538444519, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0527, + "step": 4550 + }, + { + "epoch": 0.13037884203002145, + "grad_norm": 0.5498093962669373, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0455, + "step": 4560 + }, + { + "epoch": 0.13066476054324516, + "grad_norm": 0.8072503209114075, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0471, + "step": 4570 + }, + { + "epoch": 0.1309506790564689, + "grad_norm": 0.7596970200538635, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0476, + "step": 4580 + }, + { + "epoch": 0.13123659756969264, + "grad_norm": 0.5895066857337952, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.058, + "step": 4590 + }, + { + "epoch": 0.13152251608291637, + "grad_norm": 0.7977209687232971, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0453, + "step": 4600 + }, + { + "epoch": 0.1318084345961401, + "grad_norm": 0.6070771813392639, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0626, + "step": 4610 + }, + { + "epoch": 0.13209435310936382, + "grad_norm": 0.776318371295929, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0502, + "step": 4620 + }, + { + "epoch": 0.13238027162258756, + "grad_norm": 0.7913787961006165, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0495, + "step": 4630 + }, + { + "epoch": 0.1326661901358113, + "grad_norm": 0.7327920794487, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0537, + "step": 4640 + }, + { + "epoch": 0.13295210864903503, + "grad_norm": 1.2004122734069824, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0479, + "step": 4650 + }, + { + "epoch": 0.13323802716225874, + "grad_norm": 0.663301408290863, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0426, + "step": 4660 + }, + { + "epoch": 0.13352394567548248, + "grad_norm": 0.7744486331939697, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0538, + "step": 4670 + }, + { + "epoch": 0.13380986418870622, + "grad_norm": 0.6179795265197754, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.13409578270192996, + "grad_norm": 0.6461634635925293, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0458, + "step": 4690 + }, + { + "epoch": 0.1343817012151537, + "grad_norm": 0.6578474640846252, + "learning_rate": 1.928703895604588e-05, + "loss": 0.064, + "step": 4700 + }, + { + "epoch": 0.1346676197283774, + "grad_norm": 0.8851020336151123, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0632, + "step": 4710 + }, + { + "epoch": 0.13495353824160114, + "grad_norm": 0.4704781472682953, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0431, + "step": 4720 + }, + { + "epoch": 0.13523945675482488, + "grad_norm": 0.9809741377830505, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.059, + "step": 4730 + }, + { + "epoch": 0.13552537526804861, + "grad_norm": 0.9307458400726318, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0528, + "step": 4740 + }, + { + "epoch": 0.13581129378127232, + "grad_norm": 0.8084405660629272, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0603, + "step": 4750 + }, + { + "epoch": 0.13609721229449606, + "grad_norm": 0.6919799447059631, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0589, + "step": 4760 + }, + { + "epoch": 0.1363831308077198, + "grad_norm": 0.8543849587440491, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0508, + "step": 4770 + }, + { + "epoch": 0.13666904932094354, + "grad_norm": 0.6308473348617554, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0485, + "step": 4780 + }, + { + "epoch": 0.13695496783416727, + "grad_norm": 0.739931046962738, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0534, + "step": 4790 + }, + { + "epoch": 0.13724088634739098, + "grad_norm": 0.7895604372024536, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0449, + "step": 4800 + }, + { + "epoch": 0.13752680486061472, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0496, + "step": 4810 + }, + { + "epoch": 0.13781272337383846, + "grad_norm": 0.5999978184700012, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.1380986418870622, + "grad_norm": 0.8037213087081909, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0667, + "step": 4830 + }, + { + "epoch": 0.1383845604002859, + "grad_norm": 0.7414689064025879, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0509, + "step": 4840 + }, + { + "epoch": 0.13867047891350964, + "grad_norm": 0.6627739667892456, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0515, + "step": 4850 + }, + { + "epoch": 0.13895639742673338, + "grad_norm": 0.6969587802886963, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0626, + "step": 4860 + }, + { + "epoch": 0.13924231593995712, + "grad_norm": 0.7554855942726135, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0478, + "step": 4870 + }, + { + "epoch": 0.13952823445318085, + "grad_norm": 0.5623564124107361, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.044, + "step": 4880 + }, + { + "epoch": 0.13981415296640456, + "grad_norm": 0.6897832751274109, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0466, + "step": 4890 + }, + { + "epoch": 0.1401000714796283, + "grad_norm": 0.5474520921707153, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0514, + "step": 4900 + }, + { + "epoch": 0.14038598999285204, + "grad_norm": 0.9736361503601074, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0472, + "step": 4910 + }, + { + "epoch": 0.14067190850607578, + "grad_norm": 0.5566041469573975, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0533, + "step": 4920 + }, + { + "epoch": 0.1409578270192995, + "grad_norm": 1.0295166969299316, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0478, + "step": 4930 + }, + { + "epoch": 0.14124374553252322, + "grad_norm": 1.0931389331817627, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0652, + "step": 4940 + }, + { + "epoch": 0.14152966404574696, + "grad_norm": 1.3054399490356445, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0564, + "step": 4950 + }, + { + "epoch": 0.1418155825589707, + "grad_norm": 0.45592883229255676, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0456, + "step": 4960 + }, + { + "epoch": 0.14210150107219444, + "grad_norm": 0.6758268475532532, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0543, + "step": 4970 + }, + { + "epoch": 0.14238741958541815, + "grad_norm": 0.9643615484237671, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 0.14267333809864188, + "grad_norm": 0.565969705581665, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0458, + "step": 4990 + }, + { + "epoch": 0.14295925661186562, + "grad_norm": 0.8053064346313477, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0558, + "step": 5000 + }, + { + "epoch": 0.14324517512508936, + "grad_norm": 0.606215238571167, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0562, + "step": 5010 + }, + { + "epoch": 0.14353109363831307, + "grad_norm": 0.5565656423568726, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0543, + "step": 5020 + }, + { + "epoch": 0.1438170121515368, + "grad_norm": 0.353696346282959, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0451, + "step": 5030 + }, + { + "epoch": 0.14410293066476054, + "grad_norm": 0.6627641916275024, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0607, + "step": 5040 + }, + { + "epoch": 0.14438884917798428, + "grad_norm": 0.7896742224693298, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0496, + "step": 5050 + }, + { + "epoch": 0.14467476769120802, + "grad_norm": 0.7444631457328796, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0641, + "step": 5060 + }, + { + "epoch": 0.14496068620443173, + "grad_norm": 0.7871376872062683, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 0.14524660471765546, + "grad_norm": 0.7784642577171326, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0466, + "step": 5080 + }, + { + "epoch": 0.1455325232308792, + "grad_norm": 0.6950685381889343, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0457, + "step": 5090 + }, + { + "epoch": 0.14581844174410294, + "grad_norm": 1.0631619691848755, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0513, + "step": 5100 + }, + { + "epoch": 0.14610436025732665, + "grad_norm": 0.4327051639556885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0599, + "step": 5110 + }, + { + "epoch": 0.14639027877055039, + "grad_norm": 0.7790032029151917, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0617, + "step": 5120 + }, + { + "epoch": 0.14667619728377412, + "grad_norm": 0.42061591148376465, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.14696211579699786, + "grad_norm": 1.4090712070465088, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0564, + "step": 5140 + }, + { + "epoch": 0.1472480343102216, + "grad_norm": 0.540844738483429, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0505, + "step": 5150 + }, + { + "epoch": 0.1475339528234453, + "grad_norm": 0.5608566999435425, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0583, + "step": 5160 + }, + { + "epoch": 0.14781987133666905, + "grad_norm": 0.750708818435669, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0467, + "step": 5170 + }, + { + "epoch": 0.14810578984989278, + "grad_norm": 0.608989953994751, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0507, + "step": 5180 + }, + { + "epoch": 0.14839170836311652, + "grad_norm": 0.8176707029342651, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0455, + "step": 5190 + }, + { + "epoch": 0.14867762687634023, + "grad_norm": 0.5280511379241943, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0586, + "step": 5200 + }, + { + "epoch": 0.14896354538956397, + "grad_norm": 0.5914652347564697, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.042, + "step": 5210 + }, + { + "epoch": 0.1492494639027877, + "grad_norm": 0.4816238582134247, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0431, + "step": 5220 + }, + { + "epoch": 0.14953538241601144, + "grad_norm": 0.5413132309913635, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0453, + "step": 5230 + }, + { + "epoch": 0.14982130092923518, + "grad_norm": 0.749200701713562, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0505, + "step": 5240 + }, + { + "epoch": 0.1501072194424589, + "grad_norm": 0.8051598072052002, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.15039313795568263, + "grad_norm": 0.5365609526634216, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0383, + "step": 5260 + }, + { + "epoch": 0.15067905646890636, + "grad_norm": 0.5546812415122986, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0438, + "step": 5270 + }, + { + "epoch": 0.1509649749821301, + "grad_norm": 0.6248345375061035, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.045, + "step": 5280 + }, + { + "epoch": 0.1512508934953538, + "grad_norm": 0.42673179507255554, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0435, + "step": 5290 + }, + { + "epoch": 0.15153681200857755, + "grad_norm": 0.6677115559577942, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 0.15182273052180129, + "grad_norm": 0.4739227294921875, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0516, + "step": 5310 + }, + { + "epoch": 0.15210864903502502, + "grad_norm": 0.7931821346282959, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0566, + "step": 5320 + }, + { + "epoch": 0.15239456754824876, + "grad_norm": 0.6296460032463074, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0496, + "step": 5330 + }, + { + "epoch": 0.15268048606147247, + "grad_norm": 0.6713911890983582, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0462, + "step": 5340 + }, + { + "epoch": 0.1529664045746962, + "grad_norm": 1.088040828704834, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0663, + "step": 5350 + }, + { + "epoch": 0.15325232308791994, + "grad_norm": 1.4942265748977661, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0541, + "step": 5360 + }, + { + "epoch": 0.15353824160114368, + "grad_norm": 1.5721286535263062, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0546, + "step": 5370 + }, + { + "epoch": 0.1538241601143674, + "grad_norm": 0.9329798221588135, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0538, + "step": 5380 + }, + { + "epoch": 0.15411007862759113, + "grad_norm": 0.5658103823661804, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0561, + "step": 5390 + }, + { + "epoch": 0.15439599714081487, + "grad_norm": 0.6210218071937561, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.054, + "step": 5400 + }, + { + "epoch": 0.1546819156540386, + "grad_norm": 0.7934702634811401, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0506, + "step": 5410 + }, + { + "epoch": 0.15496783416726234, + "grad_norm": 1.0321810245513916, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0483, + "step": 5420 + }, + { + "epoch": 0.15525375268048605, + "grad_norm": 0.6226248145103455, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0464, + "step": 5430 + }, + { + "epoch": 0.1555396711937098, + "grad_norm": 0.6217877864837646, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0466, + "step": 5440 + }, + { + "epoch": 0.15582558970693353, + "grad_norm": 0.44068101048469543, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0517, + "step": 5450 + }, + { + "epoch": 0.15611150822015726, + "grad_norm": 0.4715922772884369, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0391, + "step": 5460 + }, + { + "epoch": 0.15639742673338097, + "grad_norm": 0.6649858951568604, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0524, + "step": 5470 + }, + { + "epoch": 0.1566833452466047, + "grad_norm": 0.5635918974876404, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.054, + "step": 5480 + }, + { + "epoch": 0.15696926375982845, + "grad_norm": 0.5584990978240967, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0559, + "step": 5490 + }, + { + "epoch": 0.15725518227305219, + "grad_norm": 0.7777124047279358, + "learning_rate": 1.895206504082557e-05, + "loss": 0.052, + "step": 5500 + }, + { + "epoch": 0.15754110078627592, + "grad_norm": 0.7057285308837891, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0507, + "step": 5510 + }, + { + "epoch": 0.15782701929949963, + "grad_norm": 0.4290146827697754, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0508, + "step": 5520 + }, + { + "epoch": 0.15811293781272337, + "grad_norm": 0.7333746552467346, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0609, + "step": 5530 + }, + { + "epoch": 0.1583988563259471, + "grad_norm": 0.6905514001846313, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0441, + "step": 5540 + }, + { + "epoch": 0.15868477483917084, + "grad_norm": 0.4859441816806793, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0586, + "step": 5550 + }, + { + "epoch": 0.15897069335239455, + "grad_norm": 0.4259501099586487, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0446, + "step": 5560 + }, + { + "epoch": 0.1592566118656183, + "grad_norm": 0.7659216523170471, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0486, + "step": 5570 + }, + { + "epoch": 0.15954253037884203, + "grad_norm": 0.6377918124198914, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0497, + "step": 5580 + }, + { + "epoch": 0.15982844889206577, + "grad_norm": 0.9122095704078674, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0497, + "step": 5590 + }, + { + "epoch": 0.1601143674052895, + "grad_norm": 0.5986319780349731, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0789, + "step": 5600 + }, + { + "epoch": 0.1604002859185132, + "grad_norm": 0.6486982107162476, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0481, + "step": 5610 + }, + { + "epoch": 0.16068620443173695, + "grad_norm": 0.9778286814689636, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0609, + "step": 5620 + }, + { + "epoch": 0.1609721229449607, + "grad_norm": 0.9133608341217041, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0499, + "step": 5630 + }, + { + "epoch": 0.16125804145818443, + "grad_norm": 0.8979085087776184, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0539, + "step": 5640 + }, + { + "epoch": 0.16154395997140814, + "grad_norm": 0.7787102460861206, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0593, + "step": 5650 + }, + { + "epoch": 0.16182987848463187, + "grad_norm": 0.8269296884536743, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0466, + "step": 5660 + }, + { + "epoch": 0.1621157969978556, + "grad_norm": 1.0018537044525146, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0542, + "step": 5670 + }, + { + "epoch": 0.16240171551107935, + "grad_norm": 0.6690066456794739, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0504, + "step": 5680 + }, + { + "epoch": 0.16268763402430308, + "grad_norm": 0.8186119198799133, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0523, + "step": 5690 + }, + { + "epoch": 0.1629735525375268, + "grad_norm": 0.6039218902587891, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.053, + "step": 5700 + }, + { + "epoch": 0.16325947105075053, + "grad_norm": 0.5570294857025146, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0627, + "step": 5710 + }, + { + "epoch": 0.16354538956397427, + "grad_norm": 0.6330029368400574, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.043, + "step": 5720 + }, + { + "epoch": 0.163831308077198, + "grad_norm": 0.42857953906059265, + "learning_rate": 1.884459101447439e-05, + "loss": 0.043, + "step": 5730 + }, + { + "epoch": 0.16411722659042172, + "grad_norm": 0.6611765027046204, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0478, + "step": 5740 + }, + { + "epoch": 0.16440314510364545, + "grad_norm": 0.5025321841239929, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0563, + "step": 5750 + }, + { + "epoch": 0.1646890636168692, + "grad_norm": 0.468772292137146, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0579, + "step": 5760 + }, + { + "epoch": 0.16497498213009293, + "grad_norm": 0.8914149403572083, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0451, + "step": 5770 + }, + { + "epoch": 0.16526090064331667, + "grad_norm": 0.7421362996101379, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0446, + "step": 5780 + }, + { + "epoch": 0.16554681915654038, + "grad_norm": 0.6159907579421997, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0486, + "step": 5790 + }, + { + "epoch": 0.1658327376697641, + "grad_norm": 0.7762402892112732, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0528, + "step": 5800 + }, + { + "epoch": 0.16611865618298785, + "grad_norm": 0.688562273979187, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0562, + "step": 5810 + }, + { + "epoch": 0.1664045746962116, + "grad_norm": 0.6233720183372498, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0454, + "step": 5820 + }, + { + "epoch": 0.1666904932094353, + "grad_norm": 1.0762931108474731, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0586, + "step": 5830 + }, + { + "epoch": 0.16697641172265903, + "grad_norm": 0.6782101988792419, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0486, + "step": 5840 + }, + { + "epoch": 0.16726233023588277, + "grad_norm": 0.8854986429214478, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0668, + "step": 5850 + }, + { + "epoch": 0.1675482487491065, + "grad_norm": 0.6537308096885681, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0456, + "step": 5860 + }, + { + "epoch": 0.16783416726233025, + "grad_norm": 1.4588080644607544, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0531, + "step": 5870 + }, + { + "epoch": 0.16812008577555396, + "grad_norm": 0.4888838529586792, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0608, + "step": 5880 + }, + { + "epoch": 0.1684060042887777, + "grad_norm": 0.6046859622001648, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0596, + "step": 5890 + }, + { + "epoch": 0.16869192280200143, + "grad_norm": 1.0373053550720215, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0672, + "step": 5900 + }, + { + "epoch": 0.16897784131522517, + "grad_norm": 0.7728743553161621, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0567, + "step": 5910 + }, + { + "epoch": 0.16926375982844888, + "grad_norm": 0.7804396152496338, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0443, + "step": 5920 + }, + { + "epoch": 0.16954967834167262, + "grad_norm": 0.5331568717956543, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0462, + "step": 5930 + }, + { + "epoch": 0.16983559685489635, + "grad_norm": 0.5623118877410889, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0549, + "step": 5940 + }, + { + "epoch": 0.1701215153681201, + "grad_norm": 0.5113009214401245, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.17040743388134383, + "grad_norm": 0.45996031165122986, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0474, + "step": 5960 + }, + { + "epoch": 0.17069335239456754, + "grad_norm": 0.9673702716827393, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0496, + "step": 5970 + }, + { + "epoch": 0.17097927090779128, + "grad_norm": 0.6134442687034607, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0472, + "step": 5980 + }, + { + "epoch": 0.171265189421015, + "grad_norm": 0.5929660797119141, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0448, + "step": 5990 + }, + { + "epoch": 0.17155110793423875, + "grad_norm": 0.6973591446876526, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0561, + "step": 6000 + }, + { + "epoch": 0.17183702644746246, + "grad_norm": 0.6361686587333679, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0528, + "step": 6010 + }, + { + "epoch": 0.1721229449606862, + "grad_norm": 0.8463344573974609, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0505, + "step": 6020 + }, + { + "epoch": 0.17240886347390993, + "grad_norm": 0.7931243777275085, + "learning_rate": 1.869709961183946e-05, + "loss": 0.047, + "step": 6030 + }, + { + "epoch": 0.17269478198713367, + "grad_norm": 0.8827673196792603, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0449, + "step": 6040 + }, + { + "epoch": 0.1729807005003574, + "grad_norm": 0.624167263507843, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0432, + "step": 6050 + }, + { + "epoch": 0.17326661901358112, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0546, + "step": 6060 + }, + { + "epoch": 0.17355253752680486, + "grad_norm": 0.6836652755737305, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0463, + "step": 6070 + }, + { + "epoch": 0.1738384560400286, + "grad_norm": 0.5454772114753723, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0554, + "step": 6080 + }, + { + "epoch": 0.17412437455325233, + "grad_norm": 0.3758164048194885, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0437, + "step": 6090 + }, + { + "epoch": 0.17441029306647604, + "grad_norm": 0.4269026517868042, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0436, + "step": 6100 + }, + { + "epoch": 0.17469621157969978, + "grad_norm": 1.3504232168197632, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0563, + "step": 6110 + }, + { + "epoch": 0.17498213009292352, + "grad_norm": 0.6270191669464111, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0552, + "step": 6120 + }, + { + "epoch": 0.17526804860614725, + "grad_norm": 0.7632624506950378, + "learning_rate": 1.864612143364565e-05, + "loss": 0.042, + "step": 6130 + }, + { + "epoch": 0.175553967119371, + "grad_norm": 0.7420883774757385, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0472, + "step": 6140 + }, + { + "epoch": 0.1758398856325947, + "grad_norm": 0.38518550992012024, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0494, + "step": 6150 + }, + { + "epoch": 0.17612580414581844, + "grad_norm": 0.4203122556209564, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.049, + "step": 6160 + }, + { + "epoch": 0.17641172265904217, + "grad_norm": 0.843169093132019, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0528, + "step": 6170 + }, + { + "epoch": 0.1766976411722659, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0441, + "step": 6180 + }, + { + "epoch": 0.17698355968548962, + "grad_norm": 0.9894040822982788, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0494, + "step": 6190 + }, + { + "epoch": 0.17726947819871336, + "grad_norm": 0.8269744515419006, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0533, + "step": 6200 + }, + { + "epoch": 0.1775553967119371, + "grad_norm": 0.7923200726509094, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0518, + "step": 6210 + }, + { + "epoch": 0.17784131522516083, + "grad_norm": 0.580436646938324, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 0.17812723373838457, + "grad_norm": 1.0633399486541748, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0528, + "step": 6230 + }, + { + "epoch": 0.17841315225160828, + "grad_norm": 0.925599217414856, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0552, + "step": 6240 + }, + { + "epoch": 0.17869907076483202, + "grad_norm": 0.5874597430229187, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0473, + "step": 6250 + }, + { + "epoch": 0.17898498927805576, + "grad_norm": 0.9065818190574646, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0504, + "step": 6260 + }, + { + "epoch": 0.1792709077912795, + "grad_norm": 0.9060930609703064, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0578, + "step": 6270 + }, + { + "epoch": 0.1795568263045032, + "grad_norm": 0.6221855878829956, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.17984274481772694, + "grad_norm": 0.589621901512146, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0452, + "step": 6290 + }, + { + "epoch": 0.18012866333095068, + "grad_norm": 0.4308580756187439, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0528, + "step": 6300 + }, + { + "epoch": 0.18041458184417442, + "grad_norm": 0.34031248092651367, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0544, + "step": 6310 + }, + { + "epoch": 0.18070050035739815, + "grad_norm": 0.6438931226730347, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0411, + "step": 6320 + }, + { + "epoch": 0.18098641887062186, + "grad_norm": 0.5436957478523254, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0381, + "step": 6330 + }, + { + "epoch": 0.1812723373838456, + "grad_norm": 0.7326043248176575, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0486, + "step": 6340 + }, + { + "epoch": 0.18155825589706934, + "grad_norm": 0.9194608330726624, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0455, + "step": 6350 + }, + { + "epoch": 0.18184417441029307, + "grad_norm": 0.9366886019706726, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0529, + "step": 6360 + }, + { + "epoch": 0.18213009292351678, + "grad_norm": 0.3178311586380005, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0455, + "step": 6370 + }, + { + "epoch": 0.18241601143674052, + "grad_norm": 0.9811000823974609, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.054, + "step": 6380 + }, + { + "epoch": 0.18270192994996426, + "grad_norm": 0.4635869562625885, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0466, + "step": 6390 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.6958444118499756, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0448, + "step": 6400 + }, + { + "epoch": 0.18327376697641173, + "grad_norm": 0.765814483165741, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0558, + "step": 6410 + }, + { + "epoch": 0.18355968548963544, + "grad_norm": 0.4117525815963745, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0484, + "step": 6420 + }, + { + "epoch": 0.18384560400285918, + "grad_norm": 0.6114997267723083, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0449, + "step": 6430 + }, + { + "epoch": 0.18413152251608292, + "grad_norm": 0.6006572842597961, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0442, + "step": 6440 + }, + { + "epoch": 0.18441744102930666, + "grad_norm": 0.5918669104576111, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0472, + "step": 6450 + }, + { + "epoch": 0.18470335954253037, + "grad_norm": 0.42107391357421875, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0471, + "step": 6460 + }, + { + "epoch": 0.1849892780557541, + "grad_norm": 0.5666350722312927, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0451, + "step": 6470 + }, + { + "epoch": 0.18527519656897784, + "grad_norm": 0.6074198484420776, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.051, + "step": 6480 + }, + { + "epoch": 0.18556111508220158, + "grad_norm": 0.771105945110321, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0402, + "step": 6490 + }, + { + "epoch": 0.18584703359542531, + "grad_norm": 0.6381934881210327, + "learning_rate": 1.844974808419918e-05, + "loss": 0.049, + "step": 6500 + }, + { + "epoch": 0.18613295210864902, + "grad_norm": 0.4039069712162018, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0477, + "step": 6510 + }, + { + "epoch": 0.18641887062187276, + "grad_norm": 0.8936404585838318, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0515, + "step": 6520 + }, + { + "epoch": 0.1867047891350965, + "grad_norm": 0.5358276963233948, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0397, + "step": 6530 + }, + { + "epoch": 0.18699070764832024, + "grad_norm": 0.7260947823524475, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0501, + "step": 6540 + }, + { + "epoch": 0.18727662616154395, + "grad_norm": 0.6378960609436035, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0575, + "step": 6550 + }, + { + "epoch": 0.18756254467476768, + "grad_norm": 0.5879429578781128, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.041, + "step": 6560 + }, + { + "epoch": 0.18784846318799142, + "grad_norm": 0.846297025680542, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0494, + "step": 6570 + }, + { + "epoch": 0.18813438170121516, + "grad_norm": 0.5211764574050903, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0463, + "step": 6580 + }, + { + "epoch": 0.1884203002144389, + "grad_norm": 0.8060504794120789, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0486, + "step": 6590 + }, + { + "epoch": 0.1887062187276626, + "grad_norm": 0.5741685628890991, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0435, + "step": 6600 + }, + { + "epoch": 0.18899213724088634, + "grad_norm": 0.6195408701896667, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0609, + "step": 6610 + }, + { + "epoch": 0.18927805575411008, + "grad_norm": 0.46843090653419495, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0477, + "step": 6620 + }, + { + "epoch": 0.18956397426733382, + "grad_norm": 0.5169982314109802, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0515, + "step": 6630 + }, + { + "epoch": 0.18984989278055753, + "grad_norm": 0.5571608543395996, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0492, + "step": 6640 + }, + { + "epoch": 0.19013581129378126, + "grad_norm": 0.7798209190368652, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0682, + "step": 6650 + }, + { + "epoch": 0.190421729807005, + "grad_norm": 0.6120383143424988, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0516, + "step": 6660 + }, + { + "epoch": 0.19070764832022874, + "grad_norm": 1.0191924571990967, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.049, + "step": 6670 + }, + { + "epoch": 0.19099356683345248, + "grad_norm": 0.5271646976470947, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0461, + "step": 6680 + }, + { + "epoch": 0.1912794853466762, + "grad_norm": 0.3315111994743347, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 0.19156540385989992, + "grad_norm": 0.7598944306373596, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0576, + "step": 6700 + }, + { + "epoch": 0.19185132237312366, + "grad_norm": 0.8039186596870422, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0489, + "step": 6710 + }, + { + "epoch": 0.1921372408863474, + "grad_norm": 0.911704957485199, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0508, + "step": 6720 + }, + { + "epoch": 0.1924231593995711, + "grad_norm": 0.6092261672019958, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0494, + "step": 6730 + }, + { + "epoch": 0.19270907791279485, + "grad_norm": 0.7890674471855164, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.049, + "step": 6740 + }, + { + "epoch": 0.19299499642601858, + "grad_norm": 0.8601320385932922, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0402, + "step": 6750 + }, + { + "epoch": 0.19328091493924232, + "grad_norm": 0.8750951290130615, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0517, + "step": 6760 + }, + { + "epoch": 0.19356683345246606, + "grad_norm": 0.7143217921257019, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0414, + "step": 6770 + }, + { + "epoch": 0.19385275196568977, + "grad_norm": 0.8340809345245361, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.1941386704789135, + "grad_norm": 0.4074079692363739, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0455, + "step": 6790 + }, + { + "epoch": 0.19442458899213724, + "grad_norm": 0.5369135737419128, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0472, + "step": 6800 + }, + { + "epoch": 0.19471050750536098, + "grad_norm": 0.44467195868492126, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 0.1949964260185847, + "grad_norm": 0.6032440662384033, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0479, + "step": 6820 + }, + { + "epoch": 0.19528234453180843, + "grad_norm": 0.4078349173069, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 0.19556826304503216, + "grad_norm": 0.49480268359184265, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0432, + "step": 6840 + }, + { + "epoch": 0.1958541815582559, + "grad_norm": 0.9844514727592468, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0479, + "step": 6850 + }, + { + "epoch": 0.19614010007147964, + "grad_norm": 1.1353951692581177, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0539, + "step": 6860 + }, + { + "epoch": 0.19642601858470335, + "grad_norm": 0.7535272836685181, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0572, + "step": 6870 + }, + { + "epoch": 0.1967119370979271, + "grad_norm": 0.4950162470340729, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0466, + "step": 6880 + }, + { + "epoch": 0.19699785561115082, + "grad_norm": 0.5310598015785217, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0487, + "step": 6890 + }, + { + "epoch": 0.19728377412437456, + "grad_norm": 0.9481188654899597, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0448, + "step": 6900 + }, + { + "epoch": 0.19756969263759827, + "grad_norm": 0.5303207039833069, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0419, + "step": 6910 + }, + { + "epoch": 0.197855611150822, + "grad_norm": 0.6180852055549622, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 0.19814152966404575, + "grad_norm": 0.5310384631156921, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0471, + "step": 6930 + }, + { + "epoch": 0.19842744817726948, + "grad_norm": 0.546660304069519, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0481, + "step": 6940 + }, + { + "epoch": 0.19871336669049322, + "grad_norm": 0.7824214696884155, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0423, + "step": 6950 + }, + { + "epoch": 0.19899928520371693, + "grad_norm": 0.9130761623382568, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0436, + "step": 6960 + }, + { + "epoch": 0.19928520371694067, + "grad_norm": 1.0512481927871704, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 0.1995711222301644, + "grad_norm": 0.8660218715667725, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0533, + "step": 6980 + }, + { + "epoch": 0.19985704074338814, + "grad_norm": 0.5280078649520874, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0455, + "step": 6990 + }, + { + "epoch": 0.20014295925661185, + "grad_norm": 0.6151753067970276, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0476, + "step": 7000 + }, + { + "epoch": 0.2004288777698356, + "grad_norm": 0.7165628671646118, + "learning_rate": 1.815952390818299e-05, + "loss": 0.051, + "step": 7010 + }, + { + "epoch": 0.20071479628305933, + "grad_norm": 0.6857513189315796, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0566, + "step": 7020 + }, + { + "epoch": 0.20100071479628306, + "grad_norm": 0.5589154958724976, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0741, + "step": 7030 + }, + { + "epoch": 0.2012866333095068, + "grad_norm": 0.6684713959693909, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0461, + "step": 7040 + }, + { + "epoch": 0.2015725518227305, + "grad_norm": 0.41142046451568604, + "learning_rate": 1.813582526827608e-05, + "loss": 0.043, + "step": 7050 + }, + { + "epoch": 0.20185847033595425, + "grad_norm": 0.29734253883361816, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0464, + "step": 7060 + }, + { + "epoch": 0.20214438884917799, + "grad_norm": 0.3914707899093628, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.052, + "step": 7070 + }, + { + "epoch": 0.20243030736240172, + "grad_norm": 0.5075880885124207, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0401, + "step": 7080 + }, + { + "epoch": 0.20271622587562543, + "grad_norm": 0.6182138919830322, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0428, + "step": 7090 + }, + { + "epoch": 0.20300214438884917, + "grad_norm": 1.0438663959503174, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0476, + "step": 7100 + }, + { + "epoch": 0.2032880629020729, + "grad_norm": 0.4646940529346466, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0419, + "step": 7110 + }, + { + "epoch": 0.20357398141529665, + "grad_norm": 0.4236893951892853, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0539, + "step": 7120 + }, + { + "epoch": 0.20385989992852038, + "grad_norm": 0.7975651025772095, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0459, + "step": 7130 + }, + { + "epoch": 0.2041458184417441, + "grad_norm": 0.9628227949142456, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0568, + "step": 7140 + }, + { + "epoch": 0.20443173695496783, + "grad_norm": 0.8878718614578247, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0528, + "step": 7150 + }, + { + "epoch": 0.20471765546819157, + "grad_norm": 0.5407359004020691, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0417, + "step": 7160 + }, + { + "epoch": 0.2050035739814153, + "grad_norm": 0.4407803416252136, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0435, + "step": 7170 + }, + { + "epoch": 0.20528949249463901, + "grad_norm": 0.4055456221103668, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0652, + "step": 7180 + }, + { + "epoch": 0.20557541100786275, + "grad_norm": 0.44706887006759644, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0476, + "step": 7190 + }, + { + "epoch": 0.2058613295210865, + "grad_norm": 1.2640881538391113, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0496, + "step": 7200 + }, + { + "epoch": 0.20614724803431023, + "grad_norm": 0.3773214817047119, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0455, + "step": 7210 + }, + { + "epoch": 0.20643316654753396, + "grad_norm": 0.6460191011428833, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0765, + "step": 7220 + }, + { + "epoch": 0.20671908506075767, + "grad_norm": 0.6048172116279602, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0506, + "step": 7230 + }, + { + "epoch": 0.2070050035739814, + "grad_norm": 0.38502392172813416, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0487, + "step": 7240 + }, + { + "epoch": 0.20729092208720515, + "grad_norm": 1.5727262496948242, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.20757684060042889, + "grad_norm": 0.3985368609428406, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0501, + "step": 7260 + }, + { + "epoch": 0.2078627591136526, + "grad_norm": 0.4519219994544983, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0542, + "step": 7270 + }, + { + "epoch": 0.20814867762687633, + "grad_norm": 0.6547327637672424, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0628, + "step": 7280 + }, + { + "epoch": 0.20843459614010007, + "grad_norm": 0.7864896655082703, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0521, + "step": 7290 + }, + { + "epoch": 0.2087205146533238, + "grad_norm": 0.6605416536331177, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0501, + "step": 7300 + }, + { + "epoch": 0.20900643316654754, + "grad_norm": 0.8260928988456726, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 0.20929235167977125, + "grad_norm": 0.7167025804519653, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0465, + "step": 7320 + }, + { + "epoch": 0.209578270192995, + "grad_norm": 0.6838316917419434, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0449, + "step": 7330 + }, + { + "epoch": 0.20986418870621873, + "grad_norm": 0.46520882844924927, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0441, + "step": 7340 + }, + { + "epoch": 0.21015010721944247, + "grad_norm": 0.680860698223114, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0498, + "step": 7350 + }, + { + "epoch": 0.21043602573266618, + "grad_norm": 0.6697542071342468, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0361, + "step": 7360 + }, + { + "epoch": 0.21072194424588991, + "grad_norm": 0.9322425127029419, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0561, + "step": 7370 + }, + { + "epoch": 0.21100786275911365, + "grad_norm": 0.7454982399940491, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0464, + "step": 7380 + }, + { + "epoch": 0.2112937812723374, + "grad_norm": 0.5052962899208069, + "learning_rate": 1.792902262617481e-05, + "loss": 0.042, + "step": 7390 + }, + { + "epoch": 0.21157969978556113, + "grad_norm": 0.622719407081604, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0392, + "step": 7400 + }, + { + "epoch": 0.21186561829878484, + "grad_norm": 0.8296751976013184, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0512, + "step": 7410 + }, + { + "epoch": 0.21215153681200857, + "grad_norm": 0.7341750860214233, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0407, + "step": 7420 + }, + { + "epoch": 0.2124374553252323, + "grad_norm": 0.8206498026847839, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0491, + "step": 7430 + }, + { + "epoch": 0.21272337383845605, + "grad_norm": 0.5625871419906616, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0405, + "step": 7440 + }, + { + "epoch": 0.21300929235167976, + "grad_norm": 0.600284218788147, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0464, + "step": 7450 + }, + { + "epoch": 0.2132952108649035, + "grad_norm": 1.0839911699295044, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0481, + "step": 7460 + }, + { + "epoch": 0.21358112937812723, + "grad_norm": 0.45663371682167053, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0478, + "step": 7470 + }, + { + "epoch": 0.21386704789135097, + "grad_norm": 0.9196961522102356, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0566, + "step": 7480 + }, + { + "epoch": 0.2141529664045747, + "grad_norm": 0.5013288855552673, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0442, + "step": 7490 + }, + { + "epoch": 0.21443888491779842, + "grad_norm": 0.6444706916809082, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0484, + "step": 7500 + }, + { + "epoch": 0.21472480343102215, + "grad_norm": 0.5789361000061035, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0536, + "step": 7510 + }, + { + "epoch": 0.2150107219442459, + "grad_norm": 0.7474827170372009, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0526, + "step": 7520 + }, + { + "epoch": 0.21529664045746963, + "grad_norm": 0.7054215669631958, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0538, + "step": 7530 + }, + { + "epoch": 0.21558255897069334, + "grad_norm": 0.9778858423233032, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0533, + "step": 7540 + }, + { + "epoch": 0.21586847748391708, + "grad_norm": 0.7189548015594482, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0479, + "step": 7550 + }, + { + "epoch": 0.2161543959971408, + "grad_norm": 0.8761522769927979, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0512, + "step": 7560 + }, + { + "epoch": 0.21644031451036455, + "grad_norm": 0.6686418652534485, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.06, + "step": 7570 + }, + { + "epoch": 0.2167262330235883, + "grad_norm": 0.6385156512260437, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0495, + "step": 7580 + }, + { + "epoch": 0.217012151536812, + "grad_norm": 0.4785522520542145, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0477, + "step": 7590 + }, + { + "epoch": 0.21729807005003574, + "grad_norm": 0.883179783821106, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0472, + "step": 7600 + }, + { + "epoch": 0.21758398856325947, + "grad_norm": 0.5431568026542664, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0383, + "step": 7610 + }, + { + "epoch": 0.2178699070764832, + "grad_norm": 0.7085764408111572, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0465, + "step": 7620 + }, + { + "epoch": 0.21815582558970692, + "grad_norm": 0.4877212643623352, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0517, + "step": 7630 + }, + { + "epoch": 0.21844174410293066, + "grad_norm": 0.6874392032623291, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0555, + "step": 7640 + }, + { + "epoch": 0.2187276626161544, + "grad_norm": 0.9611791372299194, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0415, + "step": 7650 + }, + { + "epoch": 0.21901358112937813, + "grad_norm": 0.3618314862251282, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0601, + "step": 7660 + }, + { + "epoch": 0.21929949964260187, + "grad_norm": 0.5366251468658447, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0383, + "step": 7670 + }, + { + "epoch": 0.21958541815582558, + "grad_norm": 0.6323129534721375, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0536, + "step": 7680 + }, + { + "epoch": 0.21987133666904932, + "grad_norm": 0.4621681571006775, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0442, + "step": 7690 + }, + { + "epoch": 0.22015725518227305, + "grad_norm": 0.9297679662704468, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0517, + "step": 7700 + }, + { + "epoch": 0.2204431736954968, + "grad_norm": 0.5950489640235901, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0468, + "step": 7710 + }, + { + "epoch": 0.2207290922087205, + "grad_norm": 0.30251142382621765, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0471, + "step": 7720 + }, + { + "epoch": 0.22101501072194424, + "grad_norm": 0.6247804760932922, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0488, + "step": 7730 + }, + { + "epoch": 0.22130092923516798, + "grad_norm": 0.7118366360664368, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0567, + "step": 7740 + }, + { + "epoch": 0.2215868477483917, + "grad_norm": 0.6265056133270264, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.06, + "step": 7750 + }, + { + "epoch": 0.22187276626161545, + "grad_norm": 0.7232056260108948, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0393, + "step": 7760 + }, + { + "epoch": 0.22215868477483916, + "grad_norm": 0.7981307506561279, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0518, + "step": 7770 + }, + { + "epoch": 0.2224446032880629, + "grad_norm": 0.4492819011211395, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0425, + "step": 7780 + }, + { + "epoch": 0.22273052180128664, + "grad_norm": 0.578440248966217, + "learning_rate": 1.767371389304538e-05, + "loss": 0.043, + "step": 7790 + }, + { + "epoch": 0.22301644031451037, + "grad_norm": 0.8093826174736023, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0571, + "step": 7800 + }, + { + "epoch": 0.22330235882773408, + "grad_norm": 0.864661455154419, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0429, + "step": 7810 + }, + { + "epoch": 0.22358827734095782, + "grad_norm": 0.50054532289505, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0404, + "step": 7820 + }, + { + "epoch": 0.22387419585418156, + "grad_norm": 0.5690511465072632, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0406, + "step": 7830 + }, + { + "epoch": 0.2241601143674053, + "grad_norm": 0.7075231671333313, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0478, + "step": 7840 + }, + { + "epoch": 0.22444603288062903, + "grad_norm": 0.6326742768287659, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.065, + "step": 7850 + }, + { + "epoch": 0.22473195139385274, + "grad_norm": 0.48305049538612366, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0456, + "step": 7860 + }, + { + "epoch": 0.22501786990707648, + "grad_norm": 0.6333707571029663, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.048, + "step": 7870 + }, + { + "epoch": 0.22530378842030022, + "grad_norm": 0.6568662524223328, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0422, + "step": 7880 + }, + { + "epoch": 0.22558970693352395, + "grad_norm": 0.6302695870399475, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0589, + "step": 7890 + }, + { + "epoch": 0.22587562544674766, + "grad_norm": 0.6373940110206604, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0504, + "step": 7900 + }, + { + "epoch": 0.2261615439599714, + "grad_norm": 0.7108445167541504, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0486, + "step": 7910 + }, + { + "epoch": 0.22644746247319514, + "grad_norm": 0.5274208784103394, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0693, + "step": 7920 + }, + { + "epoch": 0.22673338098641888, + "grad_norm": 0.4020678997039795, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0461, + "step": 7930 + }, + { + "epoch": 0.2270192994996426, + "grad_norm": 0.5584745407104492, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0376, + "step": 7940 + }, + { + "epoch": 0.22730521801286632, + "grad_norm": 0.6614044904708862, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0461, + "step": 7950 + }, + { + "epoch": 0.22759113652609006, + "grad_norm": 0.506636917591095, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0431, + "step": 7960 + }, + { + "epoch": 0.2278770550393138, + "grad_norm": 0.5168156027793884, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0404, + "step": 7970 + }, + { + "epoch": 0.22816297355253753, + "grad_norm": 0.552480161190033, + "learning_rate": 1.754802282200567e-05, + "loss": 0.0565, + "step": 7980 + }, + { + "epoch": 0.22844889206576124, + "grad_norm": 0.8191191554069519, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0556, + "step": 7990 + }, + { + "epoch": 0.22873481057898498, + "grad_norm": 0.7767695188522339, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0447, + "step": 8000 + }, + { + "epoch": 0.22902072909220872, + "grad_norm": 0.9050281047821045, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0611, + "step": 8010 + }, + { + "epoch": 0.22930664760543246, + "grad_norm": 0.7805314660072327, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0532, + "step": 8020 + }, + { + "epoch": 0.2295925661186562, + "grad_norm": 0.6055987477302551, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0436, + "step": 8030 + }, + { + "epoch": 0.2298784846318799, + "grad_norm": 1.1075741052627563, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.053, + "step": 8040 + }, + { + "epoch": 0.23016440314510364, + "grad_norm": 0.6283855438232422, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0494, + "step": 8050 + }, + { + "epoch": 0.23045032165832738, + "grad_norm": 0.44009697437286377, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.047, + "step": 8060 + }, + { + "epoch": 0.23073624017155112, + "grad_norm": 0.4920162856578827, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0437, + "step": 8070 + }, + { + "epoch": 0.23102215868477483, + "grad_norm": 0.9286724328994751, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0513, + "step": 8080 + }, + { + "epoch": 0.23130807719799856, + "grad_norm": 0.6595107913017273, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0465, + "step": 8090 + }, + { + "epoch": 0.2315939957112223, + "grad_norm": 0.4930933713912964, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0422, + "step": 8100 + }, + { + "epoch": 0.23187991422444604, + "grad_norm": 0.6741859316825867, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0419, + "step": 8110 + }, + { + "epoch": 0.23216583273766978, + "grad_norm": 0.8081800937652588, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0449, + "step": 8120 + }, + { + "epoch": 0.23245175125089348, + "grad_norm": 1.0258036851882935, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0613, + "step": 8130 + }, + { + "epoch": 0.23273766976411722, + "grad_norm": 0.5007345080375671, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0473, + "step": 8140 + }, + { + "epoch": 0.23302358827734096, + "grad_norm": 0.3931804895401001, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0495, + "step": 8150 + }, + { + "epoch": 0.2333095067905647, + "grad_norm": 0.5907166600227356, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0449, + "step": 8160 + }, + { + "epoch": 0.2335954253037884, + "grad_norm": 0.49229851365089417, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0524, + "step": 8170 + }, + { + "epoch": 0.23388134381701214, + "grad_norm": 0.8386240601539612, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0527, + "step": 8180 + }, + { + "epoch": 0.23416726233023588, + "grad_norm": 0.7806615829467773, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0529, + "step": 8190 + }, + { + "epoch": 0.23445318084345962, + "grad_norm": 0.5716270804405212, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0534, + "step": 8200 + }, + { + "epoch": 0.23473909935668336, + "grad_norm": 1.165761947631836, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0591, + "step": 8210 + }, + { + "epoch": 0.23502501786990707, + "grad_norm": 0.867967426776886, + "learning_rate": 1.738529690353544e-05, + "loss": 0.049, + "step": 8220 + }, + { + "epoch": 0.2353109363831308, + "grad_norm": 0.5809492468833923, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0434, + "step": 8230 + }, + { + "epoch": 0.23559685489635454, + "grad_norm": 0.8418740034103394, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0461, + "step": 8240 + }, + { + "epoch": 0.23588277340957828, + "grad_norm": 0.5811617374420166, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0443, + "step": 8250 + }, + { + "epoch": 0.236168691922802, + "grad_norm": 0.7699318528175354, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0549, + "step": 8260 + }, + { + "epoch": 0.23645461043602573, + "grad_norm": 0.6066992878913879, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0415, + "step": 8270 + }, + { + "epoch": 0.23674052894924946, + "grad_norm": 0.7775973677635193, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0619, + "step": 8280 + }, + { + "epoch": 0.2370264474624732, + "grad_norm": 0.8320962190628052, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.048, + "step": 8290 + }, + { + "epoch": 0.23731236597569694, + "grad_norm": 0.7203818559646606, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0594, + "step": 8300 + }, + { + "epoch": 0.23759828448892065, + "grad_norm": 0.7634598612785339, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0614, + "step": 8310 + }, + { + "epoch": 0.23788420300214438, + "grad_norm": 0.557575523853302, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 0.23817012151536812, + "grad_norm": 1.0139968395233154, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0457, + "step": 8330 + }, + { + "epoch": 0.23845604002859186, + "grad_norm": 0.5543113946914673, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.048, + "step": 8340 + }, + { + "epoch": 0.23874195854181557, + "grad_norm": 1.0122590065002441, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0509, + "step": 8350 + }, + { + "epoch": 0.2390278770550393, + "grad_norm": 0.8776134252548218, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0475, + "step": 8360 + }, + { + "epoch": 0.23931379556826304, + "grad_norm": 0.41230106353759766, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0467, + "step": 8370 + }, + { + "epoch": 0.23959971408148678, + "grad_norm": 0.5460986495018005, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0455, + "step": 8380 + }, + { + "epoch": 0.23988563259471052, + "grad_norm": 0.5896333456039429, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.051, + "step": 8390 + }, + { + "epoch": 0.24017155110793423, + "grad_norm": 0.536375105381012, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0432, + "step": 8400 + }, + { + "epoch": 0.24045746962115797, + "grad_norm": 0.7597050666809082, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0459, + "step": 8410 + }, + { + "epoch": 0.2407433881343817, + "grad_norm": 0.6669795513153076, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0584, + "step": 8420 + }, + { + "epoch": 0.24102930664760544, + "grad_norm": 0.3614502251148224, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.039, + "step": 8430 + }, + { + "epoch": 0.24131522516082915, + "grad_norm": 0.5618023872375488, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0394, + "step": 8440 + }, + { + "epoch": 0.2416011436740529, + "grad_norm": 0.5897185802459717, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0502, + "step": 8450 + }, + { + "epoch": 0.24188706218727662, + "grad_norm": 0.5622876882553101, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0382, + "step": 8460 + }, + { + "epoch": 0.24217298070050036, + "grad_norm": 0.5639696717262268, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0652, + "step": 8470 + }, + { + "epoch": 0.2424588992137241, + "grad_norm": 0.5686836242675781, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0609, + "step": 8480 + }, + { + "epoch": 0.2427448177269478, + "grad_norm": 0.7248222827911377, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0512, + "step": 8490 + }, + { + "epoch": 0.24303073624017155, + "grad_norm": 0.6157225370407104, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0449, + "step": 8500 + }, + { + "epoch": 0.24331665475339528, + "grad_norm": 1.1660966873168945, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0427, + "step": 8510 + }, + { + "epoch": 0.24360257326661902, + "grad_norm": 1.1242589950561523, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0514, + "step": 8520 + }, + { + "epoch": 0.24388849177984273, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0491, + "step": 8530 + }, + { + "epoch": 0.24417441029306647, + "grad_norm": 0.41474589705467224, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0427, + "step": 8540 + }, + { + "epoch": 0.2444603288062902, + "grad_norm": 0.42195969820022583, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0486, + "step": 8550 + }, + { + "epoch": 0.24474624731951394, + "grad_norm": 0.3914433717727661, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0411, + "step": 8560 + }, + { + "epoch": 0.24503216583273768, + "grad_norm": 0.7590876817703247, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0439, + "step": 8570 + }, + { + "epoch": 0.2453180843459614, + "grad_norm": 0.4362296164035797, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0466, + "step": 8580 + }, + { + "epoch": 0.24560400285918513, + "grad_norm": 0.467949241399765, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0502, + "step": 8590 + }, + { + "epoch": 0.24588992137240887, + "grad_norm": 0.4731729328632355, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0599, + "step": 8600 + }, + { + "epoch": 0.2461758398856326, + "grad_norm": 0.491644948720932, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0524, + "step": 8610 + }, + { + "epoch": 0.2464617583988563, + "grad_norm": 0.5254928469657898, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0469, + "step": 8620 + }, + { + "epoch": 0.24674767691208005, + "grad_norm": 0.5721238255500793, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0493, + "step": 8630 + }, + { + "epoch": 0.2470335954253038, + "grad_norm": 0.5806096792221069, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0391, + "step": 8640 + }, + { + "epoch": 0.24731951393852752, + "grad_norm": 0.6683222055435181, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.24760543245175126, + "grad_norm": 0.41728726029396057, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0411, + "step": 8660 + }, + { + "epoch": 0.24789135096497497, + "grad_norm": 0.6001113653182983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0413, + "step": 8670 + }, + { + "epoch": 0.2481772694781987, + "grad_norm": 0.43813610076904297, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0389, + "step": 8680 + }, + { + "epoch": 0.24846318799142245, + "grad_norm": 1.5533791780471802, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0597, + "step": 8690 + }, + { + "epoch": 0.24874910650464618, + "grad_norm": 1.175837755203247, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 0.2490350250178699, + "grad_norm": 0.4798300862312317, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0459, + "step": 8710 + }, + { + "epoch": 0.24932094353109363, + "grad_norm": 0.7334772944450378, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0432, + "step": 8720 + }, + { + "epoch": 0.24960686204431737, + "grad_norm": 0.9633310437202454, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.05, + "step": 8730 + }, + { + "epoch": 0.2498927805575411, + "grad_norm": 0.7353480458259583, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.25017869907076484, + "grad_norm": 0.5958748459815979, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0428, + "step": 8750 + }, + { + "epoch": 0.2504646175839886, + "grad_norm": 0.8538689613342285, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0498, + "step": 8760 + }, + { + "epoch": 0.2507505360972123, + "grad_norm": 0.606607973575592, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.251036454610436, + "grad_norm": 0.3999035060405731, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0714, + "step": 8780 + }, + { + "epoch": 0.25132237312365974, + "grad_norm": 0.807314932346344, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.057, + "step": 8790 + }, + { + "epoch": 0.2516082916368835, + "grad_norm": 0.5238217115402222, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0485, + "step": 8800 + }, + { + "epoch": 0.2518942101501072, + "grad_norm": 1.6465950012207031, + "learning_rate": 1.696714953556411e-05, + "loss": 0.056, + "step": 8810 + }, + { + "epoch": 0.25218012866333095, + "grad_norm": 0.6568214297294617, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0424, + "step": 8820 + }, + { + "epoch": 0.2524660471765547, + "grad_norm": 0.4695168137550354, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0456, + "step": 8830 + }, + { + "epoch": 0.2527519656897784, + "grad_norm": 0.5652263164520264, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0527, + "step": 8840 + }, + { + "epoch": 0.25303788420300216, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0441, + "step": 8850 + }, + { + "epoch": 0.2533238027162259, + "grad_norm": 0.8288971781730652, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0513, + "step": 8860 + }, + { + "epoch": 0.2536097212294496, + "grad_norm": 0.8606051802635193, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0416, + "step": 8870 + }, + { + "epoch": 0.2538956397426733, + "grad_norm": 0.7235842347145081, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0481, + "step": 8880 + }, + { + "epoch": 0.25418155825589706, + "grad_norm": 0.9602673053741455, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0465, + "step": 8890 + }, + { + "epoch": 0.2544674767691208, + "grad_norm": 0.6431217789649963, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0489, + "step": 8900 + }, + { + "epoch": 0.25475339528234453, + "grad_norm": 0.42215701937675476, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0376, + "step": 8910 + }, + { + "epoch": 0.25503931379556827, + "grad_norm": 0.5899976491928101, + "learning_rate": 1.688644181174108e-05, + "loss": 0.048, + "step": 8920 + }, + { + "epoch": 0.255325232308792, + "grad_norm": 0.9504411816596985, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.047, + "step": 8930 + }, + { + "epoch": 0.25561115082201574, + "grad_norm": 0.5808438062667847, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0535, + "step": 8940 + }, + { + "epoch": 0.2558970693352395, + "grad_norm": 0.3811270594596863, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0418, + "step": 8950 + }, + { + "epoch": 0.25618298784846316, + "grad_norm": 1.0257363319396973, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0548, + "step": 8960 + }, + { + "epoch": 0.2564689063616869, + "grad_norm": 0.7294469475746155, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0569, + "step": 8970 + }, + { + "epoch": 0.25675482487491064, + "grad_norm": 0.4967000484466553, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0488, + "step": 8980 + }, + { + "epoch": 0.2570407433881344, + "grad_norm": 0.9160422086715698, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0471, + "step": 8990 + }, + { + "epoch": 0.2573266619013581, + "grad_norm": 0.5125435590744019, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.25761258041458185, + "grad_norm": 0.5617201328277588, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0597, + "step": 9010 + }, + { + "epoch": 0.2578984989278056, + "grad_norm": 0.7771851420402527, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0485, + "step": 9020 + }, + { + "epoch": 0.2581844174410293, + "grad_norm": 0.8434289693832397, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0429, + "step": 9030 + }, + { + "epoch": 0.25847033595425306, + "grad_norm": 0.513541042804718, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0488, + "step": 9040 + }, + { + "epoch": 0.25875625446747674, + "grad_norm": 1.0142096281051636, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0517, + "step": 9050 + }, + { + "epoch": 0.2590421729807005, + "grad_norm": 0.6343669295310974, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.049, + "step": 9060 + }, + { + "epoch": 0.2593280914939242, + "grad_norm": 0.33996936678886414, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.037, + "step": 9070 + }, + { + "epoch": 0.25961401000714796, + "grad_norm": 0.5964446663856506, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 0.2598999285203717, + "grad_norm": 0.4989728629589081, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0463, + "step": 9090 + }, + { + "epoch": 0.26018584703359543, + "grad_norm": 0.7735986113548279, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0576, + "step": 9100 + }, + { + "epoch": 0.26047176554681917, + "grad_norm": 1.2520418167114258, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0577, + "step": 9110 + }, + { + "epoch": 0.2607576840600429, + "grad_norm": 0.45247936248779297, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0458, + "step": 9120 + }, + { + "epoch": 0.26104360257326664, + "grad_norm": 0.8944823145866394, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0498, + "step": 9130 + }, + { + "epoch": 0.2613295210864903, + "grad_norm": 0.8308315277099609, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0545, + "step": 9140 + }, + { + "epoch": 0.26161543959971406, + "grad_norm": 0.6838778853416443, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 0.2619013581129378, + "grad_norm": 1.5998408794403076, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0591, + "step": 9160 + }, + { + "epoch": 0.26218727662616154, + "grad_norm": 0.8548596501350403, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.04, + "step": 9170 + }, + { + "epoch": 0.2624731951393853, + "grad_norm": 0.5784913897514343, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0464, + "step": 9180 + }, + { + "epoch": 0.262759113652609, + "grad_norm": 1.490502953529358, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0672, + "step": 9190 + }, + { + "epoch": 0.26304503216583275, + "grad_norm": 0.8950793743133545, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0532, + "step": 9200 + }, + { + "epoch": 0.2633309506790565, + "grad_norm": 0.5513611435890198, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 0.2636168691922802, + "grad_norm": 1.0512864589691162, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0487, + "step": 9220 + }, + { + "epoch": 0.2639027877055039, + "grad_norm": 0.48180028796195984, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0543, + "step": 9230 + }, + { + "epoch": 0.26418870621872764, + "grad_norm": 0.5451590418815613, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0553, + "step": 9240 + }, + { + "epoch": 0.2644746247319514, + "grad_norm": 0.6986148953437805, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0523, + "step": 9250 + }, + { + "epoch": 0.2647605432451751, + "grad_norm": 0.5977929830551147, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0446, + "step": 9260 + }, + { + "epoch": 0.26504646175839885, + "grad_norm": 0.6042361855506897, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0716, + "step": 9270 + }, + { + "epoch": 0.2653323802716226, + "grad_norm": 0.473418265581131, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0378, + "step": 9280 + }, + { + "epoch": 0.26561829878484633, + "grad_norm": 0.9332809448242188, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0484, + "step": 9290 + }, + { + "epoch": 0.26590421729807007, + "grad_norm": 0.5209246277809143, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0439, + "step": 9300 + }, + { + "epoch": 0.2661901358112938, + "grad_norm": 0.5742560625076294, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0468, + "step": 9310 + }, + { + "epoch": 0.2664760543245175, + "grad_norm": 0.585503876209259, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0507, + "step": 9320 + }, + { + "epoch": 0.2667619728377412, + "grad_norm": 0.5254957675933838, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0436, + "step": 9330 + }, + { + "epoch": 0.26704789135096496, + "grad_norm": 0.48314452171325684, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0417, + "step": 9340 + }, + { + "epoch": 0.2673338098641887, + "grad_norm": 0.630020022392273, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0425, + "step": 9350 + }, + { + "epoch": 0.26761972837741244, + "grad_norm": 0.3545299470424652, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0338, + "step": 9360 + }, + { + "epoch": 0.2679056468906362, + "grad_norm": 0.6934211850166321, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0445, + "step": 9370 + }, + { + "epoch": 0.2681915654038599, + "grad_norm": 0.6544952392578125, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0451, + "step": 9380 + }, + { + "epoch": 0.26847748391708365, + "grad_norm": 0.4581946134567261, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0422, + "step": 9390 + }, + { + "epoch": 0.2687634024303074, + "grad_norm": 0.6338506937026978, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0576, + "step": 9400 + }, + { + "epoch": 0.26904932094353107, + "grad_norm": 0.8165014386177063, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0474, + "step": 9410 + }, + { + "epoch": 0.2693352394567548, + "grad_norm": 0.793222188949585, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0546, + "step": 9420 + }, + { + "epoch": 0.26962115796997854, + "grad_norm": 0.3669852316379547, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0461, + "step": 9430 + }, + { + "epoch": 0.2699070764832023, + "grad_norm": 0.7339810729026794, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0433, + "step": 9440 + }, + { + "epoch": 0.270192994996426, + "grad_norm": 0.4948982298374176, + "learning_rate": 1.648606940465527e-05, + "loss": 0.048, + "step": 9450 + }, + { + "epoch": 0.27047891350964975, + "grad_norm": 0.4681016206741333, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0437, + "step": 9460 + }, + { + "epoch": 0.2707648320228735, + "grad_norm": 0.5091472864151001, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0576, + "step": 9470 + }, + { + "epoch": 0.27105075053609723, + "grad_norm": 0.5683515071868896, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0503, + "step": 9480 + }, + { + "epoch": 0.27133666904932097, + "grad_norm": 0.626844048500061, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0495, + "step": 9490 + }, + { + "epoch": 0.27162258756254465, + "grad_norm": 0.6757943034172058, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0495, + "step": 9500 + }, + { + "epoch": 0.2719085060757684, + "grad_norm": 0.7049196362495422, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0579, + "step": 9510 + }, + { + "epoch": 0.2721944245889921, + "grad_norm": 0.6469181776046753, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.051, + "step": 9520 + }, + { + "epoch": 0.27248034310221586, + "grad_norm": 0.5414942502975464, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0433, + "step": 9530 + }, + { + "epoch": 0.2727662616154396, + "grad_norm": 0.5642798542976379, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0495, + "step": 9540 + }, + { + "epoch": 0.27305218012866334, + "grad_norm": 1.0527595281600952, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0445, + "step": 9550 + }, + { + "epoch": 0.2733380986418871, + "grad_norm": 0.8501784801483154, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0627, + "step": 9560 + }, + { + "epoch": 0.2736240171551108, + "grad_norm": 0.7892033457756042, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 0.27390993566833455, + "grad_norm": 0.3588624596595764, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0512, + "step": 9580 + }, + { + "epoch": 0.27419585418155823, + "grad_norm": 0.7474772930145264, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.6217718124389648, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0435, + "step": 9600 + }, + { + "epoch": 0.2747676912080057, + "grad_norm": 0.7711623907089233, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.054, + "step": 9610 + }, + { + "epoch": 0.27505360972122944, + "grad_norm": 0.8171371221542358, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0371, + "step": 9620 + }, + { + "epoch": 0.2753395282344532, + "grad_norm": 0.8668338060379028, + "learning_rate": 1.634591312387623e-05, + "loss": 0.055, + "step": 9630 + }, + { + "epoch": 0.2756254467476769, + "grad_norm": 0.5683940052986145, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0478, + "step": 9640 + }, + { + "epoch": 0.27591136526090065, + "grad_norm": 0.44098007678985596, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0531, + "step": 9650 + }, + { + "epoch": 0.2761972837741244, + "grad_norm": 0.8305087685585022, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0462, + "step": 9660 + }, + { + "epoch": 0.27648320228734813, + "grad_norm": 0.9088799953460693, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0489, + "step": 9670 + }, + { + "epoch": 0.2767691208005718, + "grad_norm": 0.5590132474899292, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0486, + "step": 9680 + }, + { + "epoch": 0.27705503931379555, + "grad_norm": 0.776713490486145, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0443, + "step": 9690 + }, + { + "epoch": 0.2773409578270193, + "grad_norm": 0.6107578873634338, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0461, + "step": 9700 + }, + { + "epoch": 0.277626876340243, + "grad_norm": 0.4635901153087616, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0397, + "step": 9710 + }, + { + "epoch": 0.27791279485346676, + "grad_norm": 0.4220955967903137, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0463, + "step": 9720 + }, + { + "epoch": 0.2781987133666905, + "grad_norm": 0.4947739243507385, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0397, + "step": 9730 + }, + { + "epoch": 0.27848463187991424, + "grad_norm": 0.5589033961296082, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0426, + "step": 9740 + }, + { + "epoch": 0.278770550393138, + "grad_norm": 0.4904254972934723, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0458, + "step": 9750 + }, + { + "epoch": 0.2790564689063617, + "grad_norm": 0.34956127405166626, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.2793423874195854, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0424, + "step": 9770 + }, + { + "epoch": 0.27962830593280913, + "grad_norm": 0.48727869987487793, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0451, + "step": 9780 + }, + { + "epoch": 0.27991422444603287, + "grad_norm": 0.7314761281013489, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0523, + "step": 9790 + }, + { + "epoch": 0.2802001429592566, + "grad_norm": 0.5017405152320862, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0423, + "step": 9800 + }, + { + "epoch": 0.28048606147248034, + "grad_norm": 0.8375383615493774, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0435, + "step": 9810 + }, + { + "epoch": 0.2807719799857041, + "grad_norm": 0.8702818155288696, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0487, + "step": 9820 + }, + { + "epoch": 0.2810578984989278, + "grad_norm": 0.4649866223335266, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0483, + "step": 9830 + }, + { + "epoch": 0.28134381701215155, + "grad_norm": 0.7464607357978821, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0463, + "step": 9840 + }, + { + "epoch": 0.2816297355253753, + "grad_norm": 0.48055607080459595, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0418, + "step": 9850 + }, + { + "epoch": 0.281915654038599, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0433, + "step": 9860 + }, + { + "epoch": 0.2822015725518227, + "grad_norm": 0.8859265446662903, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0605, + "step": 9870 + }, + { + "epoch": 0.28248749106504645, + "grad_norm": 0.8236640691757202, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0441, + "step": 9880 + }, + { + "epoch": 0.2827734095782702, + "grad_norm": 0.6617199778556824, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0515, + "step": 9890 + }, + { + "epoch": 0.2830593280914939, + "grad_norm": 0.8017821907997131, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0503, + "step": 9900 + }, + { + "epoch": 0.28334524660471766, + "grad_norm": 1.070827603340149, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0485, + "step": 9910 + }, + { + "epoch": 0.2836311651179414, + "grad_norm": 1.021888256072998, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0479, + "step": 9920 + }, + { + "epoch": 0.28391708363116513, + "grad_norm": 0.34402501583099365, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0391, + "step": 9930 + }, + { + "epoch": 0.28420300214438887, + "grad_norm": 0.58541339635849, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0461, + "step": 9940 + }, + { + "epoch": 0.28448892065761255, + "grad_norm": 0.8062207102775574, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0553, + "step": 9950 + }, + { + "epoch": 0.2847748391708363, + "grad_norm": 0.6435661315917969, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0536, + "step": 9960 + }, + { + "epoch": 0.28506075768406003, + "grad_norm": 0.5670832395553589, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0405, + "step": 9970 + }, + { + "epoch": 0.28534667619728377, + "grad_norm": 0.45282548666000366, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0458, + "step": 9980 + }, + { + "epoch": 0.2856325947105075, + "grad_norm": 0.42272916436195374, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0392, + "step": 9990 + }, + { + "epoch": 0.28591851322373124, + "grad_norm": 0.5791928768157959, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0453, + "step": 10000 + }, + { + "epoch": 0.286204431736955, + "grad_norm": 0.9841408729553223, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.052, + "step": 10010 + }, + { + "epoch": 0.2864903502501787, + "grad_norm": 0.8658338785171509, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0461, + "step": 10020 + }, + { + "epoch": 0.28677626876340245, + "grad_norm": 0.624788224697113, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0416, + "step": 10030 + }, + { + "epoch": 0.28706218727662614, + "grad_norm": 0.6108028888702393, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0548, + "step": 10040 + }, + { + "epoch": 0.2873481057898499, + "grad_norm": 0.7907708883285522, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0406, + "step": 10050 + }, + { + "epoch": 0.2876340243030736, + "grad_norm": 0.7695413827896118, + "learning_rate": 1.60029690609047e-05, + "loss": 0.061, + "step": 10060 + }, + { + "epoch": 0.28791994281629735, + "grad_norm": 0.4407683312892914, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0483, + "step": 10070 + }, + { + "epoch": 0.2882058613295211, + "grad_norm": 0.6242743730545044, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.039, + "step": 10080 + }, + { + "epoch": 0.2884917798427448, + "grad_norm": 0.8752113580703735, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0433, + "step": 10090 + }, + { + "epoch": 0.28877769835596856, + "grad_norm": 0.8834511041641235, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0348, + "step": 10100 + }, + { + "epoch": 0.2890636168691923, + "grad_norm": 1.0036063194274902, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0593, + "step": 10110 + }, + { + "epoch": 0.28934953538241603, + "grad_norm": 0.5511205196380615, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0459, + "step": 10120 + }, + { + "epoch": 0.2896354538956397, + "grad_norm": 0.7717337012290955, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0381, + "step": 10130 + }, + { + "epoch": 0.28992137240886345, + "grad_norm": 1.123363971710205, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0462, + "step": 10140 + }, + { + "epoch": 0.2902072909220872, + "grad_norm": 0.6212007403373718, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0446, + "step": 10150 + }, + { + "epoch": 0.29049320943531093, + "grad_norm": 0.5547964572906494, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0362, + "step": 10160 + }, + { + "epoch": 0.29077912794853467, + "grad_norm": 0.593225359916687, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0462, + "step": 10170 + }, + { + "epoch": 0.2910650464617584, + "grad_norm": 0.5569560527801514, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0508, + "step": 10180 + }, + { + "epoch": 0.29135096497498214, + "grad_norm": 0.5464656949043274, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0399, + "step": 10190 + }, + { + "epoch": 0.2916368834882059, + "grad_norm": 1.2456778287887573, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0494, + "step": 10200 + }, + { + "epoch": 0.2919228020014296, + "grad_norm": 0.7862445712089539, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0551, + "step": 10210 + }, + { + "epoch": 0.2922087205146533, + "grad_norm": 0.745941698551178, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0469, + "step": 10220 + }, + { + "epoch": 0.29249463902787703, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0492, + "step": 10230 + }, + { + "epoch": 0.29278055754110077, + "grad_norm": 0.659205973148346, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0453, + "step": 10240 + }, + { + "epoch": 0.2930664760543245, + "grad_norm": 0.6925905346870422, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0463, + "step": 10250 + }, + { + "epoch": 0.29335239456754825, + "grad_norm": 0.479115754365921, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0395, + "step": 10260 + }, + { + "epoch": 0.293638313080772, + "grad_norm": 0.5085121393203735, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0504, + "step": 10270 + }, + { + "epoch": 0.2939242315939957, + "grad_norm": 0.46833914518356323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0411, + "step": 10280 + }, + { + "epoch": 0.29421015010721946, + "grad_norm": 0.4534672796726227, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0491, + "step": 10290 + }, + { + "epoch": 0.2944960686204432, + "grad_norm": 0.5704737305641174, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0391, + "step": 10300 + }, + { + "epoch": 0.2947819871336669, + "grad_norm": 1.0342676639556885, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0681, + "step": 10310 + }, + { + "epoch": 0.2950679056468906, + "grad_norm": 0.5002169013023376, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0429, + "step": 10320 + }, + { + "epoch": 0.29535382416011435, + "grad_norm": 0.5565863847732544, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0575, + "step": 10330 + }, + { + "epoch": 0.2956397426733381, + "grad_norm": 0.7826551198959351, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0448, + "step": 10340 + }, + { + "epoch": 0.29592566118656183, + "grad_norm": 0.7019012570381165, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0436, + "step": 10350 + }, + { + "epoch": 0.29621157969978557, + "grad_norm": 0.8324534893035889, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0503, + "step": 10360 + }, + { + "epoch": 0.2964974982130093, + "grad_norm": 0.7064073085784912, + "learning_rate": 1.574895332125391e-05, + "loss": 0.041, + "step": 10370 + }, + { + "epoch": 0.29678341672623304, + "grad_norm": 0.5634047389030457, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0474, + "step": 10380 + }, + { + "epoch": 0.2970693352394568, + "grad_norm": 0.8504926562309265, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0502, + "step": 10390 + }, + { + "epoch": 0.29735525375268046, + "grad_norm": 0.508313775062561, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0368, + "step": 10400 + }, + { + "epoch": 0.2976411722659042, + "grad_norm": 0.5851112008094788, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0472, + "step": 10410 + }, + { + "epoch": 0.29792709077912793, + "grad_norm": 0.5689557790756226, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0479, + "step": 10420 + }, + { + "epoch": 0.29821300929235167, + "grad_norm": 0.5026743412017822, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0406, + "step": 10430 + }, + { + "epoch": 0.2984989278055754, + "grad_norm": 0.5662751197814941, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0441, + "step": 10440 + }, + { + "epoch": 0.29878484631879915, + "grad_norm": 0.899709939956665, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.2990707648320229, + "grad_norm": 0.4681940972805023, + "learning_rate": 1.567419089313346e-05, + "loss": 0.054, + "step": 10460 + }, + { + "epoch": 0.2993566833452466, + "grad_norm": 0.39646071195602417, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0375, + "step": 10470 + }, + { + "epoch": 0.29964260185847036, + "grad_norm": 1.204815149307251, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0487, + "step": 10480 + }, + { + "epoch": 0.29992852037169404, + "grad_norm": 0.4507630467414856, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0516, + "step": 10490 + }, + { + "epoch": 0.3002144388849178, + "grad_norm": 0.9783321022987366, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0642, + "step": 10500 + }, + { + "epoch": 0.3005003573981415, + "grad_norm": 0.5406969785690308, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0447, + "step": 10510 + }, + { + "epoch": 0.30078627591136525, + "grad_norm": 0.44153860211372375, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0449, + "step": 10520 + }, + { + "epoch": 0.301072194424589, + "grad_norm": 0.5723687410354614, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0548, + "step": 10530 + }, + { + "epoch": 0.3013581129378127, + "grad_norm": 0.4453120529651642, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0434, + "step": 10540 + }, + { + "epoch": 0.30164403145103647, + "grad_norm": 0.34224697947502136, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0385, + "step": 10550 + }, + { + "epoch": 0.3019299499642602, + "grad_norm": 0.6389157176017761, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0569, + "step": 10560 + }, + { + "epoch": 0.30221586847748394, + "grad_norm": 0.5845953822135925, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0467, + "step": 10570 + }, + { + "epoch": 0.3025017869907076, + "grad_norm": 0.6581900119781494, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0422, + "step": 10580 + }, + { + "epoch": 0.30278770550393136, + "grad_norm": 0.4964161813259125, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0428, + "step": 10590 + }, + { + "epoch": 0.3030736240171551, + "grad_norm": 0.635380208492279, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0442, + "step": 10600 + }, + { + "epoch": 0.30335954253037883, + "grad_norm": 0.9795969128608704, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0517, + "step": 10610 + }, + { + "epoch": 0.30364546104360257, + "grad_norm": 0.9987231492996216, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0514, + "step": 10620 + }, + { + "epoch": 0.3039313795568263, + "grad_norm": 0.6384946703910828, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0471, + "step": 10630 + }, + { + "epoch": 0.30421729807005005, + "grad_norm": 0.49352115392684937, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0351, + "step": 10640 + }, + { + "epoch": 0.3045032165832738, + "grad_norm": 0.45028480887413025, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0438, + "step": 10650 + }, + { + "epoch": 0.3047891350964975, + "grad_norm": 0.5717794895172119, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0491, + "step": 10660 + }, + { + "epoch": 0.3050750536097212, + "grad_norm": 0.5436326265335083, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0407, + "step": 10670 + }, + { + "epoch": 0.30536097212294494, + "grad_norm": 0.7777692675590515, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0436, + "step": 10680 + }, + { + "epoch": 0.3056468906361687, + "grad_norm": 0.6597929000854492, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0407, + "step": 10690 + }, + { + "epoch": 0.3059328091493924, + "grad_norm": 0.6059311032295227, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0481, + "step": 10700 + }, + { + "epoch": 0.30621872766261615, + "grad_norm": 0.5530681014060974, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0418, + "step": 10710 + }, + { + "epoch": 0.3065046461758399, + "grad_norm": 0.5778716802597046, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0429, + "step": 10720 + }, + { + "epoch": 0.3067905646890636, + "grad_norm": 0.4573792517185211, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0586, + "step": 10730 + }, + { + "epoch": 0.30707648320228736, + "grad_norm": 0.8193615078926086, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0474, + "step": 10740 + }, + { + "epoch": 0.3073624017155111, + "grad_norm": 0.9410123229026794, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.3076483202287348, + "grad_norm": 0.8244432806968689, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0462, + "step": 10760 + }, + { + "epoch": 0.3079342387419585, + "grad_norm": 0.644899845123291, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0479, + "step": 10770 + }, + { + "epoch": 0.30822015725518226, + "grad_norm": 0.28044867515563965, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.04, + "step": 10780 + }, + { + "epoch": 0.308506075768406, + "grad_norm": 0.6538394093513489, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0406, + "step": 10790 + }, + { + "epoch": 0.30879199428162973, + "grad_norm": 0.9572822451591492, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0505, + "step": 10800 + }, + { + "epoch": 0.30907791279485347, + "grad_norm": 0.539826512336731, + "learning_rate": 1.537928999540189e-05, + "loss": 0.05, + "step": 10810 + }, + { + "epoch": 0.3093638313080772, + "grad_norm": 0.801988959312439, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0454, + "step": 10820 + }, + { + "epoch": 0.30964974982130095, + "grad_norm": 0.57478928565979, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.039, + "step": 10830 + }, + { + "epoch": 0.3099356683345247, + "grad_norm": 0.6313017010688782, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0384, + "step": 10840 + }, + { + "epoch": 0.31022158684774837, + "grad_norm": 0.507997989654541, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0365, + "step": 10850 + }, + { + "epoch": 0.3105075053609721, + "grad_norm": 0.5152313709259033, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0487, + "step": 10860 + }, + { + "epoch": 0.31079342387419584, + "grad_norm": 0.6123478412628174, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0405, + "step": 10870 + }, + { + "epoch": 0.3110793423874196, + "grad_norm": 1.079551100730896, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0443, + "step": 10880 + }, + { + "epoch": 0.3113652609006433, + "grad_norm": 0.39866960048675537, + "learning_rate": 1.531098472380285e-05, + "loss": 0.04, + "step": 10890 + }, + { + "epoch": 0.31165117941386705, + "grad_norm": 0.3715427815914154, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0387, + "step": 10900 + }, + { + "epoch": 0.3119370979270908, + "grad_norm": 0.7201068997383118, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.054, + "step": 10910 + }, + { + "epoch": 0.3122230164403145, + "grad_norm": 0.9512631893157959, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0383, + "step": 10920 + }, + { + "epoch": 0.31250893495353826, + "grad_norm": 0.5948206186294556, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0472, + "step": 10930 + }, + { + "epoch": 0.31279485346676195, + "grad_norm": 0.7174249291419983, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0437, + "step": 10940 + }, + { + "epoch": 0.3130807719799857, + "grad_norm": 0.6190982460975647, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0383, + "step": 10950 + }, + { + "epoch": 0.3133666904932094, + "grad_norm": 0.7733815312385559, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0327, + "step": 10960 + }, + { + "epoch": 0.31365260900643316, + "grad_norm": 1.2995271682739258, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0427, + "step": 10970 + }, + { + "epoch": 0.3139385275196569, + "grad_norm": 1.1102336645126343, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.04, + "step": 10980 + }, + { + "epoch": 0.31422444603288063, + "grad_norm": 0.7618277668952942, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0488, + "step": 10990 + }, + { + "epoch": 0.31451036454610437, + "grad_norm": 0.5355142951011658, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0436, + "step": 11000 + }, + { + "epoch": 0.3147962830593281, + "grad_norm": 1.3410072326660156, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0463, + "step": 11010 + }, + { + "epoch": 0.31508220157255185, + "grad_norm": 0.7810450196266174, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0493, + "step": 11020 + }, + { + "epoch": 0.3153681200857755, + "grad_norm": 0.6452206373214722, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0354, + "step": 11030 + }, + { + "epoch": 0.31565403859899926, + "grad_norm": 1.037593126296997, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0418, + "step": 11040 + }, + { + "epoch": 0.315939957112223, + "grad_norm": 0.7032834887504578, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0431, + "step": 11050 + }, + { + "epoch": 0.31622587562544674, + "grad_norm": 0.5168939232826233, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0472, + "step": 11060 + }, + { + "epoch": 0.3165117941386705, + "grad_norm": 0.5239925384521484, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0438, + "step": 11070 + }, + { + "epoch": 0.3167977126518942, + "grad_norm": 0.8209654688835144, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0506, + "step": 11080 + }, + { + "epoch": 0.31708363116511795, + "grad_norm": 0.5318232178688049, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0516, + "step": 11090 + }, + { + "epoch": 0.3173695496783417, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0482, + "step": 11100 + }, + { + "epoch": 0.3176554681915654, + "grad_norm": 0.6691215634346008, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.046, + "step": 11110 + }, + { + "epoch": 0.3179413867047891, + "grad_norm": 0.4862753450870514, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 0.31822730521801285, + "grad_norm": 0.4640316963195801, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0433, + "step": 11130 + }, + { + "epoch": 0.3185132237312366, + "grad_norm": 0.7841521501541138, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0445, + "step": 11140 + }, + { + "epoch": 0.3187991422444603, + "grad_norm": 0.6809426546096802, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0518, + "step": 11150 + }, + { + "epoch": 0.31908506075768406, + "grad_norm": 0.6195946931838989, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0569, + "step": 11160 + }, + { + "epoch": 0.3193709792709078, + "grad_norm": 0.7289860248565674, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0487, + "step": 11170 + }, + { + "epoch": 0.31965689778413153, + "grad_norm": 0.5575736165046692, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0409, + "step": 11180 + }, + { + "epoch": 0.31994281629735527, + "grad_norm": 0.8619267344474792, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0424, + "step": 11190 + }, + { + "epoch": 0.320228734810579, + "grad_norm": 0.740242063999176, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0474, + "step": 11200 + }, + { + "epoch": 0.3205146533238027, + "grad_norm": 0.4169894754886627, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0395, + "step": 11210 + }, + { + "epoch": 0.3208005718370264, + "grad_norm": 0.5773794651031494, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0414, + "step": 11220 + }, + { + "epoch": 0.32108649035025016, + "grad_norm": 0.4941500723361969, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0484, + "step": 11230 + }, + { + "epoch": 0.3213724088634739, + "grad_norm": 0.7985579371452332, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.051, + "step": 11240 + }, + { + "epoch": 0.32165832737669764, + "grad_norm": 0.5262066721916199, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0434, + "step": 11250 + }, + { + "epoch": 0.3219442458899214, + "grad_norm": 0.4074312150478363, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0428, + "step": 11260 + }, + { + "epoch": 0.3222301644031451, + "grad_norm": 1.0757715702056885, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0468, + "step": 11270 + }, + { + "epoch": 0.32251608291636885, + "grad_norm": 0.7281575202941895, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0386, + "step": 11280 + }, + { + "epoch": 0.3228020014295926, + "grad_norm": 0.35078516602516174, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0413, + "step": 11290 + }, + { + "epoch": 0.32308791994281627, + "grad_norm": 0.5642452836036682, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0435, + "step": 11300 + }, + { + "epoch": 0.32337383845604, + "grad_norm": 0.5326974987983704, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0459, + "step": 11310 + }, + { + "epoch": 0.32365975696926375, + "grad_norm": 0.6212049126625061, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0451, + "step": 11320 + }, + { + "epoch": 0.3239456754824875, + "grad_norm": 0.4887222349643707, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0445, + "step": 11330 + }, + { + "epoch": 0.3242315939957112, + "grad_norm": 0.6692403554916382, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0423, + "step": 11340 + }, + { + "epoch": 0.32451751250893496, + "grad_norm": 0.7166061997413635, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0445, + "step": 11350 + }, + { + "epoch": 0.3248034310221587, + "grad_norm": 0.5342463850975037, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0394, + "step": 11360 + }, + { + "epoch": 0.32508934953538243, + "grad_norm": 1.0617904663085938, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0401, + "step": 11370 + }, + { + "epoch": 0.32537526804860617, + "grad_norm": 0.9869458675384521, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0508, + "step": 11380 + }, + { + "epoch": 0.32566118656182985, + "grad_norm": 0.32021698355674744, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0346, + "step": 11390 + }, + { + "epoch": 0.3259471050750536, + "grad_norm": 0.6566154360771179, + "learning_rate": 1.486814531655139e-05, + "loss": 0.046, + "step": 11400 + }, + { + "epoch": 0.3262330235882773, + "grad_norm": 0.6716777086257935, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.045, + "step": 11410 + }, + { + "epoch": 0.32651894210150106, + "grad_norm": 0.7489042282104492, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0443, + "step": 11420 + }, + { + "epoch": 0.3268048606147248, + "grad_norm": 0.6040313243865967, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0418, + "step": 11430 + }, + { + "epoch": 0.32709077912794854, + "grad_norm": 0.4891999363899231, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0342, + "step": 11440 + }, + { + "epoch": 0.3273766976411723, + "grad_norm": 0.4264339506626129, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0414, + "step": 11450 + }, + { + "epoch": 0.327662616154396, + "grad_norm": 0.5535606741905212, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0362, + "step": 11460 + }, + { + "epoch": 0.32794853466761975, + "grad_norm": 0.566705048084259, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0472, + "step": 11470 + }, + { + "epoch": 0.32823445318084343, + "grad_norm": 0.8539089560508728, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0478, + "step": 11480 + }, + { + "epoch": 0.32852037169406717, + "grad_norm": 0.3981179893016815, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0429, + "step": 11490 + }, + { + "epoch": 0.3288062902072909, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0487, + "step": 11500 + }, + { + "epoch": 0.32909220872051465, + "grad_norm": 0.45551198720932007, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0384, + "step": 11510 + }, + { + "epoch": 0.3293781272337384, + "grad_norm": 0.6321517825126648, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0541, + "step": 11520 + }, + { + "epoch": 0.3296640457469621, + "grad_norm": 0.7971932888031006, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0445, + "step": 11530 + }, + { + "epoch": 0.32994996426018586, + "grad_norm": 0.5022657513618469, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0414, + "step": 11540 + }, + { + "epoch": 0.3302358827734096, + "grad_norm": 0.7302954196929932, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.044, + "step": 11550 + }, + { + "epoch": 0.33052180128663333, + "grad_norm": 0.5123834013938904, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0451, + "step": 11560 + }, + { + "epoch": 0.330807719799857, + "grad_norm": 0.5261625647544861, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0416, + "step": 11570 + }, + { + "epoch": 0.33109363831308075, + "grad_norm": 0.5782840251922607, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0419, + "step": 11580 + }, + { + "epoch": 0.3313795568263045, + "grad_norm": 0.9754800796508789, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0403, + "step": 11590 + }, + { + "epoch": 0.3316654753395282, + "grad_norm": 0.48157551884651184, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0459, + "step": 11600 + }, + { + "epoch": 0.33195139385275196, + "grad_norm": 0.4394964277744293, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0461, + "step": 11610 + }, + { + "epoch": 0.3322373123659757, + "grad_norm": 1.220790147781372, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0448, + "step": 11620 + }, + { + "epoch": 0.33252323087919944, + "grad_norm": 0.6908231973648071, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0431, + "step": 11630 + }, + { + "epoch": 0.3328091493924232, + "grad_norm": 0.45382779836654663, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0379, + "step": 11640 + }, + { + "epoch": 0.3330950679056469, + "grad_norm": 0.5963619947433472, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0465, + "step": 11650 + }, + { + "epoch": 0.3333809864188706, + "grad_norm": 0.676210880279541, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0411, + "step": 11660 + }, + { + "epoch": 0.33366690493209433, + "grad_norm": 0.893473744392395, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0443, + "step": 11670 + }, + { + "epoch": 0.33395282344531807, + "grad_norm": 0.30655553936958313, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.04, + "step": 11680 + }, + { + "epoch": 0.3342387419585418, + "grad_norm": 0.899615466594696, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0462, + "step": 11690 + }, + { + "epoch": 0.33452466047176554, + "grad_norm": 0.5037568807601929, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0394, + "step": 11700 + }, + { + "epoch": 0.3348105789849893, + "grad_norm": 0.573716402053833, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0426, + "step": 11710 + }, + { + "epoch": 0.335096497498213, + "grad_norm": 0.4985221326351166, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0422, + "step": 11720 + }, + { + "epoch": 0.33538241601143676, + "grad_norm": 0.8864797353744507, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0504, + "step": 11730 + }, + { + "epoch": 0.3356683345246605, + "grad_norm": 0.49209004640579224, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0409, + "step": 11740 + }, + { + "epoch": 0.3359542530378842, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0468, + "step": 11750 + }, + { + "epoch": 0.3362401715511079, + "grad_norm": 0.7552497386932373, + "learning_rate": 1.454836451908656e-05, + "loss": 0.041, + "step": 11760 + }, + { + "epoch": 0.33652609006433165, + "grad_norm": 0.5737242102622986, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0503, + "step": 11770 + }, + { + "epoch": 0.3368120085775554, + "grad_norm": 0.46150341629981995, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0399, + "step": 11780 + }, + { + "epoch": 0.3370979270907791, + "grad_norm": 0.55389803647995, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0442, + "step": 11790 + }, + { + "epoch": 0.33738384560400286, + "grad_norm": 0.7647727727890015, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0472, + "step": 11800 + }, + { + "epoch": 0.3376697641172266, + "grad_norm": 0.8755397200584412, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0444, + "step": 11810 + }, + { + "epoch": 0.33795568263045034, + "grad_norm": 0.9257917404174805, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0416, + "step": 11820 + }, + { + "epoch": 0.3382416011436741, + "grad_norm": 0.4048840403556824, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0418, + "step": 11830 + }, + { + "epoch": 0.33852751965689776, + "grad_norm": 0.584200382232666, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0436, + "step": 11840 + }, + { + "epoch": 0.3388134381701215, + "grad_norm": 0.7565616369247437, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0407, + "step": 11850 + }, + { + "epoch": 0.33909935668334523, + "grad_norm": 0.8025793433189392, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0424, + "step": 11860 + }, + { + "epoch": 0.33938527519656897, + "grad_norm": 0.3123756945133209, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.044, + "step": 11870 + }, + { + "epoch": 0.3396711937097927, + "grad_norm": 0.8047941327095032, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0471, + "step": 11880 + }, + { + "epoch": 0.33995711222301644, + "grad_norm": 0.8675779104232788, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0443, + "step": 11890 + }, + { + "epoch": 0.3402430307362402, + "grad_norm": 0.47229406237602234, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0416, + "step": 11900 + }, + { + "epoch": 0.3405289492494639, + "grad_norm": 0.3775595426559448, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0512, + "step": 11910 + }, + { + "epoch": 0.34081486776268766, + "grad_norm": 0.6179372668266296, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0395, + "step": 11920 + }, + { + "epoch": 0.34110078627591134, + "grad_norm": 0.47618359327316284, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0407, + "step": 11930 + }, + { + "epoch": 0.3413867047891351, + "grad_norm": 0.5495609641075134, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.041, + "step": 11940 + }, + { + "epoch": 0.3416726233023588, + "grad_norm": 0.7276089191436768, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0445, + "step": 11950 + }, + { + "epoch": 0.34195854181558255, + "grad_norm": 0.9464111328125, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0471, + "step": 11960 + }, + { + "epoch": 0.3422444603288063, + "grad_norm": 0.8340250253677368, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0488, + "step": 11970 + }, + { + "epoch": 0.34253037884203, + "grad_norm": 0.6392719149589539, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0407, + "step": 11980 + }, + { + "epoch": 0.34281629735525376, + "grad_norm": 0.7563493251800537, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0388, + "step": 11990 + }, + { + "epoch": 0.3431022158684775, + "grad_norm": 0.7145271301269531, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.042, + "step": 12000 + }, + { + "epoch": 0.34338813438170124, + "grad_norm": 0.6522033214569092, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0507, + "step": 12010 + }, + { + "epoch": 0.3436740528949249, + "grad_norm": 0.4634755849838257, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0388, + "step": 12020 + }, + { + "epoch": 0.34395997140814866, + "grad_norm": 0.6681762337684631, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0498, + "step": 12030 + }, + { + "epoch": 0.3442458899213724, + "grad_norm": 0.5068351626396179, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0484, + "step": 12040 + }, + { + "epoch": 0.34453180843459613, + "grad_norm": 0.5424943566322327, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0406, + "step": 12050 + }, + { + "epoch": 0.34481772694781987, + "grad_norm": 0.674436628818512, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.04, + "step": 12060 + }, + { + "epoch": 0.3451036454610436, + "grad_norm": 0.8140727281570435, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0417, + "step": 12070 + }, + { + "epoch": 0.34538956397426734, + "grad_norm": 0.6394575238227844, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0413, + "step": 12080 + }, + { + "epoch": 0.3456754824874911, + "grad_norm": 0.5134334564208984, + "learning_rate": 1.425047976058418e-05, + "loss": 0.04, + "step": 12090 + }, + { + "epoch": 0.3459614010007148, + "grad_norm": 0.6670883297920227, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0397, + "step": 12100 + }, + { + "epoch": 0.3462473195139385, + "grad_norm": 0.49804338812828064, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0431, + "step": 12110 + }, + { + "epoch": 0.34653323802716224, + "grad_norm": 0.33912673592567444, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0492, + "step": 12120 + }, + { + "epoch": 0.346819156540386, + "grad_norm": 0.45478618144989014, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0427, + "step": 12130 + }, + { + "epoch": 0.3471050750536097, + "grad_norm": 0.6690845489501953, + "learning_rate": 1.420497389129506e-05, + "loss": 0.044, + "step": 12140 + }, + { + "epoch": 0.34739099356683345, + "grad_norm": 0.9296556115150452, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.042, + "step": 12150 + }, + { + "epoch": 0.3476769120800572, + "grad_norm": 0.4859760105609894, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0386, + "step": 12160 + }, + { + "epoch": 0.3479628305932809, + "grad_norm": 1.0067541599273682, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0495, + "step": 12170 + }, + { + "epoch": 0.34824874910650466, + "grad_norm": 0.7799471616744995, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0614, + "step": 12180 + }, + { + "epoch": 0.3485346676197284, + "grad_norm": 0.48603832721710205, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0422, + "step": 12190 + }, + { + "epoch": 0.3488205861329521, + "grad_norm": 1.2030225992202759, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0535, + "step": 12200 + }, + { + "epoch": 0.3491065046461758, + "grad_norm": 0.5523782968521118, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0437, + "step": 12210 + }, + { + "epoch": 0.34939242315939956, + "grad_norm": 0.9041968584060669, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0441, + "step": 12220 + }, + { + "epoch": 0.3496783416726233, + "grad_norm": 0.5859020948410034, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0451, + "step": 12230 + }, + { + "epoch": 0.34996426018584703, + "grad_norm": 0.8736525177955627, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0439, + "step": 12240 + }, + { + "epoch": 0.35025017869907077, + "grad_norm": 0.4692678153514862, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0516, + "step": 12250 + }, + { + "epoch": 0.3505360972122945, + "grad_norm": 0.6326560974121094, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0427, + "step": 12260 + }, + { + "epoch": 0.35082201572551824, + "grad_norm": 0.6265914440155029, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0392, + "step": 12270 + }, + { + "epoch": 0.351107934238742, + "grad_norm": 0.8684681057929993, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0416, + "step": 12280 + }, + { + "epoch": 0.35139385275196566, + "grad_norm": 0.6076116561889648, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0405, + "step": 12290 + }, + { + "epoch": 0.3516797712651894, + "grad_norm": 0.36192813515663147, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0417, + "step": 12300 + }, + { + "epoch": 0.35196568977841314, + "grad_norm": 0.5561486482620239, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0397, + "step": 12310 + }, + { + "epoch": 0.3522516082916369, + "grad_norm": 0.5955346822738647, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0332, + "step": 12320 + }, + { + "epoch": 0.3525375268048606, + "grad_norm": 0.4861294627189636, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0423, + "step": 12330 + }, + { + "epoch": 0.35282344531808435, + "grad_norm": 0.920704185962677, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0467, + "step": 12340 + }, + { + "epoch": 0.3531093638313081, + "grad_norm": 0.4749159514904022, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0425, + "step": 12350 + }, + { + "epoch": 0.3533952823445318, + "grad_norm": 0.5075432658195496, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0362, + "step": 12360 + }, + { + "epoch": 0.35368120085775556, + "grad_norm": 0.3057022988796234, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0378, + "step": 12370 + }, + { + "epoch": 0.35396711937097924, + "grad_norm": 0.48122167587280273, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0359, + "step": 12380 + }, + { + "epoch": 0.354253037884203, + "grad_norm": 0.39227673411369324, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0432, + "step": 12390 + }, + { + "epoch": 0.3545389563974267, + "grad_norm": 0.641839861869812, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0422, + "step": 12400 + }, + { + "epoch": 0.35482487491065046, + "grad_norm": 1.0422887802124023, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0445, + "step": 12410 + }, + { + "epoch": 0.3551107934238742, + "grad_norm": 0.5336428880691528, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0408, + "step": 12420 + }, + { + "epoch": 0.35539671193709793, + "grad_norm": 0.6634368896484375, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0374, + "step": 12430 + }, + { + "epoch": 0.35568263045032167, + "grad_norm": 0.5840758085250854, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0417, + "step": 12440 + }, + { + "epoch": 0.3559685489635454, + "grad_norm": 0.8465530872344971, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0449, + "step": 12450 + }, + { + "epoch": 0.35625446747676914, + "grad_norm": 0.48737838864326477, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0439, + "step": 12460 + }, + { + "epoch": 0.3565403859899928, + "grad_norm": 1.2267687320709229, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0395, + "step": 12470 + }, + { + "epoch": 0.35682630450321656, + "grad_norm": 0.4097842276096344, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0379, + "step": 12480 + }, + { + "epoch": 0.3571122230164403, + "grad_norm": 0.8895343542098999, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0415, + "step": 12490 + }, + { + "epoch": 0.35739814152966404, + "grad_norm": 0.6732933521270752, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0432, + "step": 12500 + }, + { + "epoch": 0.3576840600428878, + "grad_norm": 0.4521937966346741, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0442, + "step": 12510 + }, + { + "epoch": 0.3579699785561115, + "grad_norm": 0.5932701826095581, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0407, + "step": 12520 + }, + { + "epoch": 0.35825589706933525, + "grad_norm": 0.5595138669013977, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0387, + "step": 12530 + }, + { + "epoch": 0.358541815582559, + "grad_norm": 0.7205538153648376, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0393, + "step": 12540 + }, + { + "epoch": 0.3588277340957827, + "grad_norm": 0.4069580137729645, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0554, + "step": 12550 + }, + { + "epoch": 0.3591136526090064, + "grad_norm": 0.4881740212440491, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0411, + "step": 12560 + }, + { + "epoch": 0.35939957112223014, + "grad_norm": 0.7710328102111816, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.043, + "step": 12570 + }, + { + "epoch": 0.3596854896354539, + "grad_norm": 0.6593908071517944, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.046, + "step": 12580 + }, + { + "epoch": 0.3599714081486776, + "grad_norm": 0.6712149977684021, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0392, + "step": 12590 + }, + { + "epoch": 0.36025732666190136, + "grad_norm": 0.6103658080101013, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0482, + "step": 12600 + }, + { + "epoch": 0.3605432451751251, + "grad_norm": 0.5170528292655945, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0441, + "step": 12610 + }, + { + "epoch": 0.36082916368834883, + "grad_norm": 0.47434374690055847, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0436, + "step": 12620 + }, + { + "epoch": 0.36111508220157257, + "grad_norm": 0.6546452045440674, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0441, + "step": 12630 + }, + { + "epoch": 0.3614010007147963, + "grad_norm": 1.3334686756134033, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0464, + "step": 12640 + }, + { + "epoch": 0.36168691922802, + "grad_norm": 1.3882309198379517, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0527, + "step": 12650 + }, + { + "epoch": 0.3619728377412437, + "grad_norm": 0.829872190952301, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0499, + "step": 12660 + }, + { + "epoch": 0.36225875625446746, + "grad_norm": 0.6917227506637573, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0513, + "step": 12670 + }, + { + "epoch": 0.3625446747676912, + "grad_norm": 0.3825722634792328, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0455, + "step": 12680 + }, + { + "epoch": 0.36283059328091494, + "grad_norm": 0.7726976275444031, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0366, + "step": 12690 + }, + { + "epoch": 0.3631165117941387, + "grad_norm": 0.48851099610328674, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0363, + "step": 12700 + }, + { + "epoch": 0.3634024303073624, + "grad_norm": 0.5034362077713013, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0461, + "step": 12710 + }, + { + "epoch": 0.36368834882058615, + "grad_norm": 0.8411096334457397, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0448, + "step": 12720 + }, + { + "epoch": 0.3639742673338099, + "grad_norm": 0.7185337543487549, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0366, + "step": 12730 + }, + { + "epoch": 0.36426018584703357, + "grad_norm": 0.5850857496261597, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0414, + "step": 12740 + }, + { + "epoch": 0.3645461043602573, + "grad_norm": 0.47304606437683105, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0464, + "step": 12750 + }, + { + "epoch": 0.36483202287348104, + "grad_norm": 0.7190109491348267, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0418, + "step": 12760 + }, + { + "epoch": 0.3651179413867048, + "grad_norm": 0.8053406476974487, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0407, + "step": 12770 + }, + { + "epoch": 0.3654038598999285, + "grad_norm": 0.8875076174736023, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0471, + "step": 12780 + }, + { + "epoch": 0.36568977841315226, + "grad_norm": 0.5206999182701111, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0478, + "step": 12790 + }, + { + "epoch": 0.365975696926376, + "grad_norm": 0.5034269690513611, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0422, + "step": 12800 + }, + { + "epoch": 0.36626161543959973, + "grad_norm": 0.9846853017807007, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.045, + "step": 12810 + }, + { + "epoch": 0.36654753395282347, + "grad_norm": 0.49341151118278503, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0471, + "step": 12820 + }, + { + "epoch": 0.36683345246604715, + "grad_norm": 0.765583336353302, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0411, + "step": 12830 + }, + { + "epoch": 0.3671193709792709, + "grad_norm": 0.5193378925323486, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0522, + "step": 12840 + }, + { + "epoch": 0.3674052894924946, + "grad_norm": 0.8142374157905579, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0374, + "step": 12850 + }, + { + "epoch": 0.36769120800571836, + "grad_norm": 0.7233540415763855, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0516, + "step": 12860 + }, + { + "epoch": 0.3679771265189421, + "grad_norm": 0.38758793473243713, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0437, + "step": 12870 + }, + { + "epoch": 0.36826304503216584, + "grad_norm": 0.36923956871032715, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.041, + "step": 12880 + }, + { + "epoch": 0.3685489635453896, + "grad_norm": 1.0518147945404053, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0446, + "step": 12890 + }, + { + "epoch": 0.3688348820586133, + "grad_norm": 0.5833591818809509, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0362, + "step": 12900 + }, + { + "epoch": 0.36912080057183705, + "grad_norm": 0.6178849339485168, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.041, + "step": 12910 + }, + { + "epoch": 0.36940671908506073, + "grad_norm": 0.7599044442176819, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0473, + "step": 12920 + }, + { + "epoch": 0.36969263759828447, + "grad_norm": 0.7787651419639587, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0461, + "step": 12930 + }, + { + "epoch": 0.3699785561115082, + "grad_norm": 0.3847586512565613, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0413, + "step": 12940 + }, + { + "epoch": 0.37026447462473194, + "grad_norm": 0.6218805313110352, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0424, + "step": 12950 + }, + { + "epoch": 0.3705503931379557, + "grad_norm": 0.6770363450050354, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0426, + "step": 12960 + }, + { + "epoch": 0.3708363116511794, + "grad_norm": 0.6817107796669006, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.041, + "step": 12970 + }, + { + "epoch": 0.37112223016440316, + "grad_norm": 1.6997944116592407, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0626, + "step": 12980 + }, + { + "epoch": 0.3714081486776269, + "grad_norm": 0.4540708363056183, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0356, + "step": 12990 + }, + { + "epoch": 0.37169406719085063, + "grad_norm": 0.4272336959838867, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0354, + "step": 13000 + }, + { + "epoch": 0.3719799857040743, + "grad_norm": 0.4723891019821167, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0438, + "step": 13010 + }, + { + "epoch": 0.37226590421729805, + "grad_norm": 0.5508099794387817, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.042, + "step": 13020 + }, + { + "epoch": 0.3725518227305218, + "grad_norm": 1.05836021900177, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0472, + "step": 13030 + }, + { + "epoch": 0.3728377412437455, + "grad_norm": 0.4397801458835602, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0462, + "step": 13040 + }, + { + "epoch": 0.37312365975696926, + "grad_norm": 0.3131158649921417, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0383, + "step": 13050 + }, + { + "epoch": 0.373409578270193, + "grad_norm": 0.5489990711212158, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0398, + "step": 13060 + }, + { + "epoch": 0.37369549678341674, + "grad_norm": 0.7425751686096191, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0416, + "step": 13070 + }, + { + "epoch": 0.3739814152966405, + "grad_norm": 0.6337125301361084, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0387, + "step": 13080 + }, + { + "epoch": 0.3742673338098642, + "grad_norm": 0.656467854976654, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0431, + "step": 13090 + }, + { + "epoch": 0.3745532523230879, + "grad_norm": 0.7011964321136475, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0487, + "step": 13100 + }, + { + "epoch": 0.37483917083631163, + "grad_norm": 0.4949609041213989, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0429, + "step": 13110 + }, + { + "epoch": 0.37512508934953537, + "grad_norm": 0.6796516180038452, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0405, + "step": 13120 + }, + { + "epoch": 0.3754110078627591, + "grad_norm": 0.41161492466926575, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0359, + "step": 13130 + }, + { + "epoch": 0.37569692637598284, + "grad_norm": 0.4463254511356354, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0353, + "step": 13140 + }, + { + "epoch": 0.3759828448892066, + "grad_norm": 0.4082377254962921, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.047, + "step": 13150 + }, + { + "epoch": 0.3762687634024303, + "grad_norm": 0.7927104830741882, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0484, + "step": 13160 + }, + { + "epoch": 0.37655468191565405, + "grad_norm": 0.5212385058403015, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.041, + "step": 13170 + }, + { + "epoch": 0.3768406004288778, + "grad_norm": 0.7408128380775452, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0462, + "step": 13180 + }, + { + "epoch": 0.3771265189421015, + "grad_norm": 0.3847906291484833, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0361, + "step": 13190 + }, + { + "epoch": 0.3774124374553252, + "grad_norm": 0.5039756298065186, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0385, + "step": 13200 + }, + { + "epoch": 0.37769835596854895, + "grad_norm": 0.5682945251464844, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0369, + "step": 13210 + }, + { + "epoch": 0.3779842744817727, + "grad_norm": 0.5985261797904968, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0376, + "step": 13220 + }, + { + "epoch": 0.3782701929949964, + "grad_norm": 0.7080312967300415, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0514, + "step": 13230 + }, + { + "epoch": 0.37855611150822016, + "grad_norm": 0.7488406300544739, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0421, + "step": 13240 + }, + { + "epoch": 0.3788420300214439, + "grad_norm": 0.38066044449806213, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0411, + "step": 13250 + }, + { + "epoch": 0.37912794853466764, + "grad_norm": 0.6335283517837524, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0526, + "step": 13260 + }, + { + "epoch": 0.3794138670478914, + "grad_norm": 0.7008160352706909, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0402, + "step": 13270 + }, + { + "epoch": 0.37969978556111506, + "grad_norm": 0.4219777286052704, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.039, + "step": 13280 + }, + { + "epoch": 0.3799857040743388, + "grad_norm": 0.6447705030441284, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0412, + "step": 13290 + }, + { + "epoch": 0.38027162258756253, + "grad_norm": 0.4625374674797058, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0437, + "step": 13300 + }, + { + "epoch": 0.38055754110078627, + "grad_norm": 0.4056257903575897, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0377, + "step": 13310 + }, + { + "epoch": 0.38084345961401, + "grad_norm": 0.425281286239624, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0378, + "step": 13320 + }, + { + "epoch": 0.38112937812723374, + "grad_norm": 0.4031837582588196, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0361, + "step": 13330 + }, + { + "epoch": 0.3814152966404575, + "grad_norm": 0.469175785779953, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0391, + "step": 13340 + }, + { + "epoch": 0.3817012151536812, + "grad_norm": 0.36555227637290955, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0352, + "step": 13350 + }, + { + "epoch": 0.38198713366690495, + "grad_norm": 0.8802763819694519, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0412, + "step": 13360 + }, + { + "epoch": 0.38227305218012864, + "grad_norm": 0.5733079314231873, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0418, + "step": 13370 + }, + { + "epoch": 0.3825589706933524, + "grad_norm": 0.606238842010498, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0518, + "step": 13380 + }, + { + "epoch": 0.3828448892065761, + "grad_norm": 0.5096673369407654, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0404, + "step": 13390 + }, + { + "epoch": 0.38313080771979985, + "grad_norm": 0.8240867853164673, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0513, + "step": 13400 + }, + { + "epoch": 0.3834167262330236, + "grad_norm": 0.3757685422897339, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0407, + "step": 13410 + }, + { + "epoch": 0.3837026447462473, + "grad_norm": 0.4560941755771637, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0429, + "step": 13420 + }, + { + "epoch": 0.38398856325947106, + "grad_norm": 0.42831951379776, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0387, + "step": 13430 + }, + { + "epoch": 0.3842744817726948, + "grad_norm": 0.8373785614967346, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0473, + "step": 13440 + }, + { + "epoch": 0.38456040028591854, + "grad_norm": 0.9560670256614685, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0442, + "step": 13450 + }, + { + "epoch": 0.3848463187991422, + "grad_norm": 0.4101570248603821, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0429, + "step": 13460 + }, + { + "epoch": 0.38513223731236595, + "grad_norm": 0.673739492893219, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0525, + "step": 13470 + }, + { + "epoch": 0.3854181558255897, + "grad_norm": 1.126909852027893, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0499, + "step": 13480 + }, + { + "epoch": 0.38570407433881343, + "grad_norm": 0.571437656879425, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0431, + "step": 13490 + }, + { + "epoch": 0.38598999285203717, + "grad_norm": 0.5121229887008667, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0419, + "step": 13500 + }, + { + "epoch": 0.3862759113652609, + "grad_norm": 0.6143786907196045, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0373, + "step": 13510 + }, + { + "epoch": 0.38656182987848464, + "grad_norm": 0.395014226436615, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0457, + "step": 13520 + }, + { + "epoch": 0.3868477483917084, + "grad_norm": 0.46027693152427673, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0372, + "step": 13530 + }, + { + "epoch": 0.3871336669049321, + "grad_norm": 0.42744559049606323, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0417, + "step": 13540 + }, + { + "epoch": 0.3874195854181558, + "grad_norm": 0.4765837490558624, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0442, + "step": 13550 + }, + { + "epoch": 0.38770550393137954, + "grad_norm": 0.9767054319381714, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0397, + "step": 13560 + }, + { + "epoch": 0.3879914224446033, + "grad_norm": 0.5535935759544373, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0423, + "step": 13570 + }, + { + "epoch": 0.388277340957827, + "grad_norm": 0.3802829384803772, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0329, + "step": 13580 + }, + { + "epoch": 0.38856325947105075, + "grad_norm": 0.6564178466796875, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0423, + "step": 13590 + }, + { + "epoch": 0.3888491779842745, + "grad_norm": 0.4400223195552826, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0356, + "step": 13600 + }, + { + "epoch": 0.3891350964974982, + "grad_norm": 0.4441612958908081, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0576, + "step": 13610 + }, + { + "epoch": 0.38942101501072196, + "grad_norm": 0.5270922780036926, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0406, + "step": 13620 + }, + { + "epoch": 0.3897069335239457, + "grad_norm": 0.6497722268104553, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0389, + "step": 13630 + }, + { + "epoch": 0.3899928520371694, + "grad_norm": 0.628182053565979, + "learning_rate": 1.280216624157504e-05, + "loss": 0.049, + "step": 13640 + }, + { + "epoch": 0.3902787705503931, + "grad_norm": 0.5242640376091003, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0389, + "step": 13650 + }, + { + "epoch": 0.39056468906361685, + "grad_norm": 0.5140895843505859, + "learning_rate": 1.278305741539386e-05, + "loss": 0.047, + "step": 13660 + }, + { + "epoch": 0.3908506075768406, + "grad_norm": 0.531012773513794, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0415, + "step": 13670 + }, + { + "epoch": 0.39113652609006433, + "grad_norm": 0.5066007375717163, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0411, + "step": 13680 + }, + { + "epoch": 0.39142244460328807, + "grad_norm": 1.0783177614212036, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0371, + "step": 13690 + }, + { + "epoch": 0.3917083631165118, + "grad_norm": 0.592755913734436, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0402, + "step": 13700 + }, + { + "epoch": 0.39199428162973554, + "grad_norm": 0.5595790147781372, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0543, + "step": 13710 + }, + { + "epoch": 0.3922802001429593, + "grad_norm": 0.5388237237930298, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0487, + "step": 13720 + }, + { + "epoch": 0.39256611865618296, + "grad_norm": 0.5311065316200256, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0389, + "step": 13730 + }, + { + "epoch": 0.3928520371694067, + "grad_norm": 0.8037494421005249, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0456, + "step": 13740 + }, + { + "epoch": 0.39313795568263044, + "grad_norm": 0.851921796798706, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0389, + "step": 13750 + }, + { + "epoch": 0.3934238741958542, + "grad_norm": 0.5924596190452576, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0401, + "step": 13760 + }, + { + "epoch": 0.3937097927090779, + "grad_norm": 0.5660725831985474, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0443, + "step": 13770 + }, + { + "epoch": 0.39399571122230165, + "grad_norm": 0.4110502004623413, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0438, + "step": 13780 + }, + { + "epoch": 0.3942816297355254, + "grad_norm": 0.7104408144950867, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.042, + "step": 13790 + }, + { + "epoch": 0.3945675482487491, + "grad_norm": 0.5490137338638306, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0477, + "step": 13800 + }, + { + "epoch": 0.39485346676197286, + "grad_norm": 0.4189203083515167, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0446, + "step": 13810 + }, + { + "epoch": 0.39513938527519654, + "grad_norm": 3.620929479598999, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0541, + "step": 13820 + }, + { + "epoch": 0.3954253037884203, + "grad_norm": 0.4670915901660919, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0391, + "step": 13830 + }, + { + "epoch": 0.395711222301644, + "grad_norm": 0.4475649297237396, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.04, + "step": 13840 + }, + { + "epoch": 0.39599714081486775, + "grad_norm": 0.4646693170070648, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0412, + "step": 13850 + }, + { + "epoch": 0.3962830593280915, + "grad_norm": 0.4141371250152588, + "learning_rate": 1.259152361972498e-05, + "loss": 0.039, + "step": 13860 + }, + { + "epoch": 0.39656897784131523, + "grad_norm": 0.7549411058425903, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0414, + "step": 13870 + }, + { + "epoch": 0.39685489635453897, + "grad_norm": 0.5687856078147888, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0441, + "step": 13880 + }, + { + "epoch": 0.3971408148677627, + "grad_norm": 0.582946240901947, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0451, + "step": 13890 + }, + { + "epoch": 0.39742673338098644, + "grad_norm": 0.6410595178604126, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0362, + "step": 13900 + }, + { + "epoch": 0.3977126518942101, + "grad_norm": 0.4375670850276947, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0552, + "step": 13910 + }, + { + "epoch": 0.39799857040743386, + "grad_norm": 0.5675646662712097, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0373, + "step": 13920 + }, + { + "epoch": 0.3982844889206576, + "grad_norm": 0.544170618057251, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0449, + "step": 13930 + }, + { + "epoch": 0.39857040743388134, + "grad_norm": 0.44928276538848877, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0461, + "step": 13940 + }, + { + "epoch": 0.3988563259471051, + "grad_norm": 0.511382520198822, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0413, + "step": 13950 + }, + { + "epoch": 0.3991422444603288, + "grad_norm": 0.38443753123283386, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0374, + "step": 13960 + }, + { + "epoch": 0.39942816297355255, + "grad_norm": 0.5726080536842346, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0553, + "step": 13970 + }, + { + "epoch": 0.3997140814867763, + "grad_norm": 0.554694414138794, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0404, + "step": 13980 + }, + { + "epoch": 0.4, + "grad_norm": 0.4891316592693329, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0418, + "step": 13990 + }, + { + "epoch": 0.4002859185132237, + "grad_norm": 0.5150312781333923, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0418, + "step": 14000 + }, + { + "epoch": 0.40057183702644744, + "grad_norm": 0.9077253937721252, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0415, + "step": 14010 + }, + { + "epoch": 0.4008577555396712, + "grad_norm": 0.9126781225204468, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.048, + "step": 14020 + }, + { + "epoch": 0.4011436740528949, + "grad_norm": 0.6264623999595642, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0411, + "step": 14030 + }, + { + "epoch": 0.40142959256611865, + "grad_norm": 0.523853600025177, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.051, + "step": 14040 + }, + { + "epoch": 0.4017155110793424, + "grad_norm": 0.6340035200119019, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0426, + "step": 14050 + }, + { + "epoch": 0.40200142959256613, + "grad_norm": 0.3594725430011749, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0397, + "step": 14060 + }, + { + "epoch": 0.40228734810578987, + "grad_norm": 0.941470742225647, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0402, + "step": 14070 + }, + { + "epoch": 0.4025732666190136, + "grad_norm": 0.840506911277771, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0473, + "step": 14080 + }, + { + "epoch": 0.4028591851322373, + "grad_norm": 0.3359200954437256, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0405, + "step": 14090 + }, + { + "epoch": 0.403145103645461, + "grad_norm": 0.49658629298210144, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0464, + "step": 14100 + }, + { + "epoch": 0.40343102215868476, + "grad_norm": 0.7940187454223633, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0417, + "step": 14110 + }, + { + "epoch": 0.4037169406719085, + "grad_norm": 0.30110660195350647, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0371, + "step": 14120 + }, + { + "epoch": 0.40400285918513223, + "grad_norm": 0.42845240235328674, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.053, + "step": 14130 + }, + { + "epoch": 0.40428877769835597, + "grad_norm": 0.997348427772522, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.041, + "step": 14140 + }, + { + "epoch": 0.4045746962115797, + "grad_norm": 0.4759966731071472, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0377, + "step": 14150 + }, + { + "epoch": 0.40486061472480345, + "grad_norm": 0.42045602202415466, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0397, + "step": 14160 + }, + { + "epoch": 0.4051465332380272, + "grad_norm": 0.6400002837181091, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0507, + "step": 14170 + }, + { + "epoch": 0.40543245175125087, + "grad_norm": 0.5473673939704895, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0359, + "step": 14180 + }, + { + "epoch": 0.4057183702644746, + "grad_norm": 0.7414730787277222, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0416, + "step": 14190 + }, + { + "epoch": 0.40600428877769834, + "grad_norm": 0.4691861867904663, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0363, + "step": 14200 + }, + { + "epoch": 0.4062902072909221, + "grad_norm": 0.9186112880706787, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0445, + "step": 14210 + }, + { + "epoch": 0.4065761258041458, + "grad_norm": 0.6782190203666687, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.40686204431736955, + "grad_norm": 0.6948013305664062, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.037, + "step": 14230 + }, + { + "epoch": 0.4071479628305933, + "grad_norm": 0.3034680485725403, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0371, + "step": 14240 + }, + { + "epoch": 0.40743388134381703, + "grad_norm": 0.4254174828529358, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0449, + "step": 14250 + }, + { + "epoch": 0.40771979985704077, + "grad_norm": 1.3622064590454102, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0428, + "step": 14260 + }, + { + "epoch": 0.40800571837026445, + "grad_norm": 0.5928359031677246, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0424, + "step": 14270 + }, + { + "epoch": 0.4082916368834882, + "grad_norm": 0.9103132486343384, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0414, + "step": 14280 + }, + { + "epoch": 0.4085775553967119, + "grad_norm": 0.6338028311729431, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0376, + "step": 14290 + }, + { + "epoch": 0.40886347390993566, + "grad_norm": 0.9920284748077393, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0393, + "step": 14300 + }, + { + "epoch": 0.4091493924231594, + "grad_norm": 0.411830335855484, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0336, + "step": 14310 + }, + { + "epoch": 0.40943531093638313, + "grad_norm": 0.6977682709693909, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0454, + "step": 14320 + }, + { + "epoch": 0.40972122944960687, + "grad_norm": 0.6303663849830627, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0453, + "step": 14330 + }, + { + "epoch": 0.4100071479628306, + "grad_norm": 0.3048207759857178, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0373, + "step": 14340 + }, + { + "epoch": 0.41029306647605435, + "grad_norm": 0.7683395743370056, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0438, + "step": 14350 + }, + { + "epoch": 0.41057898498927803, + "grad_norm": 0.5791511535644531, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0392, + "step": 14360 + }, + { + "epoch": 0.41086490350250177, + "grad_norm": 0.876626193523407, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0324, + "step": 14370 + }, + { + "epoch": 0.4111508220157255, + "grad_norm": 0.5971815586090088, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0368, + "step": 14380 + }, + { + "epoch": 0.41143674052894924, + "grad_norm": 0.6508862376213074, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0411, + "step": 14390 + }, + { + "epoch": 0.411722659042173, + "grad_norm": 0.4704359471797943, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0351, + "step": 14400 + }, + { + "epoch": 0.4120085775553967, + "grad_norm": 0.4266453683376312, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0367, + "step": 14410 + }, + { + "epoch": 0.41229449606862045, + "grad_norm": 0.5898434519767761, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0376, + "step": 14420 + }, + { + "epoch": 0.4125804145818442, + "grad_norm": 0.8741532564163208, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0419, + "step": 14430 + }, + { + "epoch": 0.41286633309506793, + "grad_norm": 0.24328190088272095, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0333, + "step": 14440 + }, + { + "epoch": 0.4131522516082916, + "grad_norm": 0.4263601303100586, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.039, + "step": 14450 + }, + { + "epoch": 0.41343817012151535, + "grad_norm": 0.6311615109443665, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0454, + "step": 14460 + }, + { + "epoch": 0.4137240886347391, + "grad_norm": 0.7424519658088684, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0392, + "step": 14470 + }, + { + "epoch": 0.4140100071479628, + "grad_norm": 0.48323145508766174, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0374, + "step": 14480 + }, + { + "epoch": 0.41429592566118656, + "grad_norm": 0.38597407937049866, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0393, + "step": 14490 + }, + { + "epoch": 0.4145818441744103, + "grad_norm": 0.7251518964767456, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0431, + "step": 14500 + }, + { + "epoch": 0.41486776268763403, + "grad_norm": 0.44361060857772827, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0426, + "step": 14510 + }, + { + "epoch": 0.41515368120085777, + "grad_norm": 0.5625014305114746, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0372, + "step": 14520 + }, + { + "epoch": 0.4154395997140815, + "grad_norm": 0.27855798602104187, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0356, + "step": 14530 + }, + { + "epoch": 0.4157255182273052, + "grad_norm": 0.5966296195983887, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0387, + "step": 14540 + }, + { + "epoch": 0.41601143674052893, + "grad_norm": 0.49445512890815735, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0355, + "step": 14550 + }, + { + "epoch": 0.41629735525375267, + "grad_norm": 0.3813278377056122, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0456, + "step": 14560 + }, + { + "epoch": 0.4165832737669764, + "grad_norm": 0.5962988138198853, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0401, + "step": 14570 + }, + { + "epoch": 0.41686919228020014, + "grad_norm": 0.4028547406196594, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0371, + "step": 14580 + }, + { + "epoch": 0.4171551107934239, + "grad_norm": 1.348706841468811, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0426, + "step": 14590 + }, + { + "epoch": 0.4174410293066476, + "grad_norm": 1.2782070636749268, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0393, + "step": 14600 + }, + { + "epoch": 0.41772694781987135, + "grad_norm": 1.0024999380111694, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0436, + "step": 14610 + }, + { + "epoch": 0.4180128663330951, + "grad_norm": 0.35450127720832825, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0411, + "step": 14620 + }, + { + "epoch": 0.41829878484631877, + "grad_norm": 0.5827250480651855, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0372, + "step": 14630 + }, + { + "epoch": 0.4185847033595425, + "grad_norm": 0.5905774235725403, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0394, + "step": 14640 + }, + { + "epoch": 0.41887062187276625, + "grad_norm": 0.652074933052063, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0405, + "step": 14650 + }, + { + "epoch": 0.41915654038599, + "grad_norm": 0.7245490550994873, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0473, + "step": 14660 + }, + { + "epoch": 0.4194424588992137, + "grad_norm": 0.5153012871742249, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.043, + "step": 14670 + }, + { + "epoch": 0.41972837741243746, + "grad_norm": 0.516107976436615, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0434, + "step": 14680 + }, + { + "epoch": 0.4200142959256612, + "grad_norm": 0.4743354618549347, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0429, + "step": 14690 + }, + { + "epoch": 0.42030021443888493, + "grad_norm": 0.547875165939331, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0395, + "step": 14700 + }, + { + "epoch": 0.42058613295210867, + "grad_norm": 0.6398400068283081, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0384, + "step": 14710 + }, + { + "epoch": 0.42087205146533235, + "grad_norm": 0.5891467332839966, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0399, + "step": 14720 + }, + { + "epoch": 0.4211579699785561, + "grad_norm": 0.3927595615386963, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0353, + "step": 14730 + }, + { + "epoch": 0.42144388849177983, + "grad_norm": 0.6477030515670776, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0492, + "step": 14740 + }, + { + "epoch": 0.42172980700500357, + "grad_norm": 0.7090615034103394, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.042, + "step": 14750 + }, + { + "epoch": 0.4220157255182273, + "grad_norm": 0.6572134494781494, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0406, + "step": 14760 + }, + { + "epoch": 0.42230164403145104, + "grad_norm": 0.787663996219635, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0424, + "step": 14770 + }, + { + "epoch": 0.4225875625446748, + "grad_norm": 0.8419309258460999, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0427, + "step": 14780 + }, + { + "epoch": 0.4228734810578985, + "grad_norm": 0.6204128861427307, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0364, + "step": 14790 + }, + { + "epoch": 0.42315939957112225, + "grad_norm": 0.7446070313453674, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0391, + "step": 14800 + }, + { + "epoch": 0.42344531808434593, + "grad_norm": 0.7446451783180237, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0384, + "step": 14810 + }, + { + "epoch": 0.42373123659756967, + "grad_norm": 0.6946475505828857, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0375, + "step": 14820 + }, + { + "epoch": 0.4240171551107934, + "grad_norm": 0.6997008323669434, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0393, + "step": 14830 + }, + { + "epoch": 0.42430307362401715, + "grad_norm": 0.4857316315174103, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0474, + "step": 14840 + }, + { + "epoch": 0.4245889921372409, + "grad_norm": 1.3516888618469238, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.047, + "step": 14850 + }, + { + "epoch": 0.4248749106504646, + "grad_norm": 0.40320220589637756, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0418, + "step": 14860 + }, + { + "epoch": 0.42516082916368836, + "grad_norm": 0.9002796411514282, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0434, + "step": 14870 + }, + { + "epoch": 0.4254467476769121, + "grad_norm": 0.3810071349143982, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0338, + "step": 14880 + }, + { + "epoch": 0.42573266619013583, + "grad_norm": 0.5786157250404358, + "learning_rate": 1.159527607963768e-05, + "loss": 0.037, + "step": 14890 + }, + { + "epoch": 0.4260185847033595, + "grad_norm": 0.6316869258880615, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0388, + "step": 14900 + }, + { + "epoch": 0.42630450321658325, + "grad_norm": 0.608745276927948, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0426, + "step": 14910 + }, + { + "epoch": 0.426590421729807, + "grad_norm": 0.6655036807060242, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0433, + "step": 14920 + }, + { + "epoch": 0.4268763402430307, + "grad_norm": 0.29059523344039917, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0507, + "step": 14930 + }, + { + "epoch": 0.42716225875625446, + "grad_norm": 0.9066076278686523, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0447, + "step": 14940 + }, + { + "epoch": 0.4274481772694782, + "grad_norm": 1.0660220384597778, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0512, + "step": 14950 + }, + { + "epoch": 0.42773409578270194, + "grad_norm": 0.6081144213676453, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0426, + "step": 14960 + }, + { + "epoch": 0.4280200142959257, + "grad_norm": 0.46524369716644287, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0435, + "step": 14970 + }, + { + "epoch": 0.4283059328091494, + "grad_norm": 0.3497388958930969, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0492, + "step": 14980 + }, + { + "epoch": 0.4285918513223731, + "grad_norm": 0.41300803422927856, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.034, + "step": 14990 + }, + { + "epoch": 0.42887776983559683, + "grad_norm": 0.4363289177417755, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0358, + "step": 15000 + }, + { + "epoch": 0.42916368834882057, + "grad_norm": 1.314915418624878, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.047, + "step": 15010 + }, + { + "epoch": 0.4294496068620443, + "grad_norm": 0.558199942111969, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0313, + "step": 15020 + }, + { + "epoch": 0.42973552537526805, + "grad_norm": 0.3857463598251343, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0416, + "step": 15030 + }, + { + "epoch": 0.4300214438884918, + "grad_norm": 0.4701749384403229, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0425, + "step": 15040 + }, + { + "epoch": 0.4303073624017155, + "grad_norm": 0.4611213803291321, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0457, + "step": 15050 + }, + { + "epoch": 0.43059328091493926, + "grad_norm": 0.5338016152381897, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.038, + "step": 15060 + }, + { + "epoch": 0.430879199428163, + "grad_norm": 0.9078943133354187, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0395, + "step": 15070 + }, + { + "epoch": 0.4311651179413867, + "grad_norm": 0.5354048013687134, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0403, + "step": 15080 + }, + { + "epoch": 0.4314510364546104, + "grad_norm": 0.35511279106140137, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0377, + "step": 15090 + }, + { + "epoch": 0.43173695496783415, + "grad_norm": 0.37104350328445435, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0426, + "step": 15100 + }, + { + "epoch": 0.4320228734810579, + "grad_norm": 0.8916210532188416, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0387, + "step": 15110 + }, + { + "epoch": 0.4323087919942816, + "grad_norm": 0.514994740486145, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0384, + "step": 15120 + }, + { + "epoch": 0.43259471050750536, + "grad_norm": 0.8440690040588379, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0437, + "step": 15130 + }, + { + "epoch": 0.4328806290207291, + "grad_norm": 0.6815949082374573, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0453, + "step": 15140 + }, + { + "epoch": 0.43316654753395284, + "grad_norm": 0.33178189396858215, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0351, + "step": 15150 + }, + { + "epoch": 0.4334524660471766, + "grad_norm": 0.5686727166175842, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0368, + "step": 15160 + }, + { + "epoch": 0.43373838456040026, + "grad_norm": 0.44143930077552795, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0443, + "step": 15170 + }, + { + "epoch": 0.434024303073624, + "grad_norm": 0.3238232135772705, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0348, + "step": 15180 + }, + { + "epoch": 0.43431022158684773, + "grad_norm": 0.5038242340087891, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0343, + "step": 15190 + }, + { + "epoch": 0.43459614010007147, + "grad_norm": 0.4904351234436035, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0397, + "step": 15200 + }, + { + "epoch": 0.4348820586132952, + "grad_norm": 0.5325750708580017, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0499, + "step": 15210 + }, + { + "epoch": 0.43516797712651895, + "grad_norm": 0.39443954825401306, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.044, + "step": 15220 + }, + { + "epoch": 0.4354538956397427, + "grad_norm": 0.6782003045082092, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0358, + "step": 15230 + }, + { + "epoch": 0.4357398141529664, + "grad_norm": 0.47862571477890015, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0418, + "step": 15240 + }, + { + "epoch": 0.43602573266619016, + "grad_norm": 1.6515535116195679, + "learning_rate": 1.124468908014616e-05, + "loss": 0.043, + "step": 15250 + }, + { + "epoch": 0.43631165117941384, + "grad_norm": 0.4902660846710205, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0371, + "step": 15260 + }, + { + "epoch": 0.4365975696926376, + "grad_norm": 0.5742762088775635, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0369, + "step": 15270 + }, + { + "epoch": 0.4368834882058613, + "grad_norm": 0.42058590054512024, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0378, + "step": 15280 + }, + { + "epoch": 0.43716940671908505, + "grad_norm": 0.43729284405708313, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0352, + "step": 15290 + }, + { + "epoch": 0.4374553252323088, + "grad_norm": 0.4689466953277588, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0433, + "step": 15300 + }, + { + "epoch": 0.4377412437455325, + "grad_norm": 0.6272432208061218, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0548, + "step": 15310 + }, + { + "epoch": 0.43802716225875626, + "grad_norm": 1.1129611730575562, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0437, + "step": 15320 + }, + { + "epoch": 0.43831308077198, + "grad_norm": 0.9332655072212219, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0503, + "step": 15330 + }, + { + "epoch": 0.43859899928520374, + "grad_norm": 0.35150477290153503, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0351, + "step": 15340 + }, + { + "epoch": 0.4388849177984274, + "grad_norm": 0.3826565444469452, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0361, + "step": 15350 + }, + { + "epoch": 0.43917083631165116, + "grad_norm": 0.817319393157959, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0352, + "step": 15360 + }, + { + "epoch": 0.4394567548248749, + "grad_norm": 0.4379598796367645, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0469, + "step": 15370 + }, + { + "epoch": 0.43974267333809863, + "grad_norm": 0.6475314497947693, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0456, + "step": 15380 + }, + { + "epoch": 0.44002859185132237, + "grad_norm": 0.529088020324707, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0453, + "step": 15390 + }, + { + "epoch": 0.4403145103645461, + "grad_norm": 0.4915194809436798, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0369, + "step": 15400 + }, + { + "epoch": 0.44060042887776985, + "grad_norm": 0.4766380786895752, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0391, + "step": 15410 + }, + { + "epoch": 0.4408863473909936, + "grad_norm": 0.34667786955833435, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0327, + "step": 15420 + }, + { + "epoch": 0.4411722659042173, + "grad_norm": 0.504242479801178, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0413, + "step": 15430 + }, + { + "epoch": 0.441458184417441, + "grad_norm": 0.49786439538002014, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0361, + "step": 15440 + }, + { + "epoch": 0.44174410293066474, + "grad_norm": 0.4997329115867615, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0368, + "step": 15450 + }, + { + "epoch": 0.4420300214438885, + "grad_norm": 0.2992185056209564, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0359, + "step": 15460 + }, + { + "epoch": 0.4423159399571122, + "grad_norm": 0.6645393371582031, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0401, + "step": 15470 + }, + { + "epoch": 0.44260185847033595, + "grad_norm": 0.6327983140945435, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0386, + "step": 15480 + }, + { + "epoch": 0.4428877769835597, + "grad_norm": 0.45607903599739075, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0386, + "step": 15490 + }, + { + "epoch": 0.4431736954967834, + "grad_norm": 0.4401610493659973, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0417, + "step": 15500 + }, + { + "epoch": 0.44345961401000716, + "grad_norm": 0.5778466463088989, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0417, + "step": 15510 + }, + { + "epoch": 0.4437455325232309, + "grad_norm": 0.2164914309978485, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0355, + "step": 15520 + }, + { + "epoch": 0.4440314510364546, + "grad_norm": 0.3869318664073944, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0361, + "step": 15530 + }, + { + "epoch": 0.4443173695496783, + "grad_norm": 0.3843154311180115, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0459, + "step": 15540 + }, + { + "epoch": 0.44460328806290206, + "grad_norm": 0.8488825559616089, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0406, + "step": 15550 + }, + { + "epoch": 0.4448892065761258, + "grad_norm": 0.5055183172225952, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0359, + "step": 15560 + }, + { + "epoch": 0.44517512508934953, + "grad_norm": 0.40923011302948, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0435, + "step": 15570 + }, + { + "epoch": 0.44546104360257327, + "grad_norm": 0.48997730016708374, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0395, + "step": 15580 + }, + { + "epoch": 0.445746962115797, + "grad_norm": 0.5149131417274475, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.041, + "step": 15590 + }, + { + "epoch": 0.44603288062902074, + "grad_norm": 0.7277303338050842, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0452, + "step": 15600 + }, + { + "epoch": 0.4463187991422445, + "grad_norm": 0.48676377534866333, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0363, + "step": 15610 + }, + { + "epoch": 0.44660471765546816, + "grad_norm": 0.49031221866607666, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0356, + "step": 15620 + }, + { + "epoch": 0.4468906361686919, + "grad_norm": 0.38877514004707336, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.036, + "step": 15630 + }, + { + "epoch": 0.44717655468191564, + "grad_norm": 0.570068895816803, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0403, + "step": 15640 + }, + { + "epoch": 0.4474624731951394, + "grad_norm": 0.48499882221221924, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0395, + "step": 15650 + }, + { + "epoch": 0.4477483917083631, + "grad_norm": 0.7251732349395752, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0399, + "step": 15660 + }, + { + "epoch": 0.44803431022158685, + "grad_norm": 0.3927334249019623, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0359, + "step": 15670 + }, + { + "epoch": 0.4483202287348106, + "grad_norm": 0.5614549517631531, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.035, + "step": 15680 + }, + { + "epoch": 0.4486061472480343, + "grad_norm": 0.383831262588501, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0416, + "step": 15690 + }, + { + "epoch": 0.44889206576125806, + "grad_norm": 1.9365276098251343, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0498, + "step": 15700 + }, + { + "epoch": 0.44917798427448175, + "grad_norm": 0.6964924931526184, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.034, + "step": 15710 + }, + { + "epoch": 0.4494639027877055, + "grad_norm": 0.5148108601570129, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0401, + "step": 15720 + }, + { + "epoch": 0.4497498213009292, + "grad_norm": 0.4529317617416382, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0361, + "step": 15730 + }, + { + "epoch": 0.45003573981415296, + "grad_norm": 0.6648512482643127, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0365, + "step": 15740 + }, + { + "epoch": 0.4503216583273767, + "grad_norm": 0.8183113932609558, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0416, + "step": 15750 + }, + { + "epoch": 0.45060757684060043, + "grad_norm": 0.8802638649940491, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0406, + "step": 15760 + }, + { + "epoch": 0.45089349535382417, + "grad_norm": 0.6329004764556885, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0395, + "step": 15770 + }, + { + "epoch": 0.4511794138670479, + "grad_norm": 0.35283520817756653, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0364, + "step": 15780 + }, + { + "epoch": 0.45146533238027164, + "grad_norm": 0.5156061053276062, + "learning_rate": 1.071827766589186e-05, + "loss": 0.031, + "step": 15790 + }, + { + "epoch": 0.4517512508934953, + "grad_norm": 0.37875205278396606, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0375, + "step": 15800 + }, + { + "epoch": 0.45203716940671906, + "grad_norm": 0.5543273687362671, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0421, + "step": 15810 + }, + { + "epoch": 0.4523230879199428, + "grad_norm": 0.3808431923389435, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0323, + "step": 15820 + }, + { + "epoch": 0.45260900643316654, + "grad_norm": 0.8648643493652344, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0396, + "step": 15830 + }, + { + "epoch": 0.4528949249463903, + "grad_norm": 0.7893536686897278, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0417, + "step": 15840 + }, + { + "epoch": 0.453180843459614, + "grad_norm": 0.904137134552002, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0384, + "step": 15850 + }, + { + "epoch": 0.45346676197283775, + "grad_norm": 0.6095889806747437, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0457, + "step": 15860 + }, + { + "epoch": 0.4537526804860615, + "grad_norm": 0.5691415667533875, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0438, + "step": 15870 + }, + { + "epoch": 0.4540385989992852, + "grad_norm": 0.37868618965148926, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0414, + "step": 15880 + }, + { + "epoch": 0.4543245175125089, + "grad_norm": 0.7962950468063354, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0405, + "step": 15890 + }, + { + "epoch": 0.45461043602573264, + "grad_norm": 0.8862378597259521, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0475, + "step": 15900 + }, + { + "epoch": 0.4548963545389564, + "grad_norm": 0.8762509822845459, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0472, + "step": 15910 + }, + { + "epoch": 0.4551822730521801, + "grad_norm": 0.6006313562393188, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0417, + "step": 15920 + }, + { + "epoch": 0.45546819156540386, + "grad_norm": 0.3340131938457489, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0374, + "step": 15930 + }, + { + "epoch": 0.4557541100786276, + "grad_norm": 0.2639552056789398, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0387, + "step": 15940 + }, + { + "epoch": 0.45604002859185133, + "grad_norm": 0.42564907670021057, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0376, + "step": 15950 + }, + { + "epoch": 0.45632594710507507, + "grad_norm": 0.503834068775177, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0344, + "step": 15960 + }, + { + "epoch": 0.4566118656182988, + "grad_norm": 0.5962334871292114, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0379, + "step": 15970 + }, + { + "epoch": 0.4568977841315225, + "grad_norm": 0.3271556794643402, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0361, + "step": 15980 + }, + { + "epoch": 0.4571837026447462, + "grad_norm": 0.5501612424850464, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0356, + "step": 15990 + }, + { + "epoch": 0.45746962115796996, + "grad_norm": 1.0399914979934692, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.039, + "step": 16000 + }, + { + "epoch": 0.4577555396711937, + "grad_norm": 0.42251288890838623, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0413, + "step": 16010 + }, + { + "epoch": 0.45804145818441744, + "grad_norm": 0.5694882869720459, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0501, + "step": 16020 + }, + { + "epoch": 0.4583273766976412, + "grad_norm": 0.37367814779281616, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0388, + "step": 16030 + }, + { + "epoch": 0.4586132952108649, + "grad_norm": 0.7947224974632263, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0324, + "step": 16040 + }, + { + "epoch": 0.45889921372408865, + "grad_norm": 0.47871798276901245, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0345, + "step": 16050 + }, + { + "epoch": 0.4591851322373124, + "grad_norm": 1.4443609714508057, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0502, + "step": 16060 + }, + { + "epoch": 0.45947105075053607, + "grad_norm": 0.8326191902160645, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0325, + "step": 16070 + }, + { + "epoch": 0.4597569692637598, + "grad_norm": 0.2887400686740875, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.035, + "step": 16080 + }, + { + "epoch": 0.46004288777698354, + "grad_norm": 0.34353405237197876, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0324, + "step": 16090 + }, + { + "epoch": 0.4603288062902073, + "grad_norm": 0.7319850325584412, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0307, + "step": 16100 + }, + { + "epoch": 0.460614724803431, + "grad_norm": 0.6628556847572327, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0398, + "step": 16110 + }, + { + "epoch": 0.46090064331665476, + "grad_norm": 0.39974722266197205, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.038, + "step": 16120 + }, + { + "epoch": 0.4611865618298785, + "grad_norm": 0.7769339680671692, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0425, + "step": 16130 + }, + { + "epoch": 0.46147248034310223, + "grad_norm": 0.6823691129684448, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.039, + "step": 16140 + }, + { + "epoch": 0.46175839885632597, + "grad_norm": 0.6749460697174072, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0388, + "step": 16150 + }, + { + "epoch": 0.46204431736954965, + "grad_norm": 1.0745635032653809, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0406, + "step": 16160 + }, + { + "epoch": 0.4623302358827734, + "grad_norm": 0.8388734459877014, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0345, + "step": 16170 + }, + { + "epoch": 0.4626161543959971, + "grad_norm": 0.675828218460083, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0355, + "step": 16180 + }, + { + "epoch": 0.46290207290922086, + "grad_norm": 0.9872504472732544, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0374, + "step": 16190 + }, + { + "epoch": 0.4631879914224446, + "grad_norm": 0.4705125689506531, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0416, + "step": 16200 + }, + { + "epoch": 0.46347390993566834, + "grad_norm": 0.43577539920806885, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.041, + "step": 16210 + }, + { + "epoch": 0.4637598284488921, + "grad_norm": 0.6472166180610657, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0372, + "step": 16220 + }, + { + "epoch": 0.4640457469621158, + "grad_norm": 1.0108906030654907, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0464, + "step": 16230 + }, + { + "epoch": 0.46433166547533955, + "grad_norm": 0.6221884489059448, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0396, + "step": 16240 + }, + { + "epoch": 0.46461758398856323, + "grad_norm": 0.7375202178955078, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0365, + "step": 16250 + }, + { + "epoch": 0.46490350250178697, + "grad_norm": 0.5090222358703613, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0404, + "step": 16260 + }, + { + "epoch": 0.4651894210150107, + "grad_norm": 0.5641722679138184, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0424, + "step": 16270 + }, + { + "epoch": 0.46547533952823444, + "grad_norm": 0.3946240246295929, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0433, + "step": 16280 + }, + { + "epoch": 0.4657612580414582, + "grad_norm": 0.525059700012207, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0399, + "step": 16290 + }, + { + "epoch": 0.4660471765546819, + "grad_norm": 0.6106441617012024, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0417, + "step": 16300 + }, + { + "epoch": 0.46633309506790566, + "grad_norm": 0.7064299583435059, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0331, + "step": 16310 + }, + { + "epoch": 0.4666190135811294, + "grad_norm": 0.6251654624938965, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0377, + "step": 16320 + }, + { + "epoch": 0.46690493209435313, + "grad_norm": 0.6626482009887695, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0355, + "step": 16330 + }, + { + "epoch": 0.4671908506075768, + "grad_norm": 0.32827794551849365, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0438, + "step": 16340 + }, + { + "epoch": 0.46747676912080055, + "grad_norm": 1.147644281387329, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.041, + "step": 16350 + }, + { + "epoch": 0.4677626876340243, + "grad_norm": 0.5785626769065857, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0362, + "step": 16360 + }, + { + "epoch": 0.468048606147248, + "grad_norm": 0.7087936401367188, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0364, + "step": 16370 + }, + { + "epoch": 0.46833452466047176, + "grad_norm": 0.7729533314704895, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0357, + "step": 16380 + }, + { + "epoch": 0.4686204431736955, + "grad_norm": 0.9080077409744263, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0445, + "step": 16390 + }, + { + "epoch": 0.46890636168691924, + "grad_norm": 0.5273067355155945, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0395, + "step": 16400 + }, + { + "epoch": 0.469192280200143, + "grad_norm": 0.4801991581916809, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0469, + "step": 16410 + }, + { + "epoch": 0.4694781987133667, + "grad_norm": 0.38060688972473145, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0377, + "step": 16420 + }, + { + "epoch": 0.4697641172265904, + "grad_norm": 1.335648536682129, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0444, + "step": 16430 + }, + { + "epoch": 0.47005003573981413, + "grad_norm": 0.6224690079689026, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0365, + "step": 16440 + }, + { + "epoch": 0.47033595425303787, + "grad_norm": 0.39938899874687195, + "learning_rate": 1.007637577910799e-05, + "loss": 0.037, + "step": 16450 + }, + { + "epoch": 0.4706218727662616, + "grad_norm": 0.47899872064590454, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0371, + "step": 16460 + }, + { + "epoch": 0.47090779127948534, + "grad_norm": 0.8991144895553589, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0337, + "step": 16470 + }, + { + "epoch": 0.4711937097927091, + "grad_norm": 0.6228598356246948, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0388, + "step": 16480 + }, + { + "epoch": 0.4714796283059328, + "grad_norm": 0.41108259558677673, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0378, + "step": 16490 + }, + { + "epoch": 0.47176554681915656, + "grad_norm": 0.722955048084259, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0381, + "step": 16500 + }, + { + "epoch": 0.4720514653323803, + "grad_norm": 0.6090973019599915, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0348, + "step": 16510 + }, + { + "epoch": 0.472337383845604, + "grad_norm": 0.483549565076828, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0456, + "step": 16520 + }, + { + "epoch": 0.4726233023588277, + "grad_norm": 0.4134727418422699, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0444, + "step": 16530 + }, + { + "epoch": 0.47290922087205145, + "grad_norm": 0.4629753530025482, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0382, + "step": 16540 + }, + { + "epoch": 0.4731951393852752, + "grad_norm": 0.8709504008293152, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0384, + "step": 16550 + }, + { + "epoch": 0.4734810578984989, + "grad_norm": 0.683397114276886, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0398, + "step": 16560 + }, + { + "epoch": 0.47376697641172266, + "grad_norm": 0.5743465423583984, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0431, + "step": 16570 + }, + { + "epoch": 0.4740528949249464, + "grad_norm": 1.0080480575561523, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0378, + "step": 16580 + }, + { + "epoch": 0.47433881343817014, + "grad_norm": 0.4668700098991394, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0369, + "step": 16590 + }, + { + "epoch": 0.4746247319513939, + "grad_norm": 0.6005896925926208, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0508, + "step": 16600 + }, + { + "epoch": 0.47491065046461756, + "grad_norm": 0.5788530707359314, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0354, + "step": 16610 + }, + { + "epoch": 0.4751965689778413, + "grad_norm": 0.38784441351890564, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0357, + "step": 16620 + }, + { + "epoch": 0.47548248749106503, + "grad_norm": 0.4809567928314209, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0331, + "step": 16630 + }, + { + "epoch": 0.47576840600428877, + "grad_norm": 0.6647809147834778, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0473, + "step": 16640 + }, + { + "epoch": 0.4760543245175125, + "grad_norm": 0.3968522548675537, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0304, + "step": 16650 + }, + { + "epoch": 0.47634024303073624, + "grad_norm": 0.3258526027202606, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0387, + "step": 16660 + }, + { + "epoch": 0.47662616154396, + "grad_norm": 0.43442079424858093, + "learning_rate": 9.863295834019308e-06, + "loss": 0.04, + "step": 16670 + }, + { + "epoch": 0.4769120800571837, + "grad_norm": 0.36909565329551697, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0351, + "step": 16680 + }, + { + "epoch": 0.47719799857040746, + "grad_norm": 0.5566768050193787, + "learning_rate": 9.843955128197274e-06, + "loss": 0.031, + "step": 16690 + }, + { + "epoch": 0.47748391708363114, + "grad_norm": 0.5705142617225647, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0359, + "step": 16700 + }, + { + "epoch": 0.4777698355968549, + "grad_norm": 0.28931716084480286, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0407, + "step": 16710 + }, + { + "epoch": 0.4780557541100786, + "grad_norm": 0.5509498715400696, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0363, + "step": 16720 + }, + { + "epoch": 0.47834167262330235, + "grad_norm": 0.3564346432685852, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0364, + "step": 16730 + }, + { + "epoch": 0.4786275911365261, + "grad_norm": 0.32734423875808716, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0369, + "step": 16740 + }, + { + "epoch": 0.4789135096497498, + "grad_norm": 0.3048594892024994, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0367, + "step": 16750 + }, + { + "epoch": 0.47919942816297356, + "grad_norm": 0.9007049798965454, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0377, + "step": 16760 + }, + { + "epoch": 0.4794853466761973, + "grad_norm": 0.7010983824729919, + "learning_rate": 9.76664747972605e-06, + "loss": 0.039, + "step": 16770 + }, + { + "epoch": 0.47977126518942104, + "grad_norm": 0.644473135471344, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0466, + "step": 16780 + }, + { + "epoch": 0.4800571837026447, + "grad_norm": 0.6333492398262024, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0373, + "step": 16790 + }, + { + "epoch": 0.48034310221586846, + "grad_norm": 0.5148355960845947, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0392, + "step": 16800 + }, + { + "epoch": 0.4806290207290922, + "grad_norm": 0.7288355231285095, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0381, + "step": 16810 + }, + { + "epoch": 0.48091493924231593, + "grad_norm": 0.3674873113632202, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0418, + "step": 16820 + }, + { + "epoch": 0.48120085775553967, + "grad_norm": 0.5055420398712158, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0336, + "step": 16830 + }, + { + "epoch": 0.4814867762687634, + "grad_norm": 0.641754686832428, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0342, + "step": 16840 + }, + { + "epoch": 0.48177269478198714, + "grad_norm": 0.308200478553772, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0364, + "step": 16850 + }, + { + "epoch": 0.4820586132952109, + "grad_norm": 0.41361021995544434, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0342, + "step": 16860 + }, + { + "epoch": 0.4823445318084346, + "grad_norm": 0.45777833461761475, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0353, + "step": 16870 + }, + { + "epoch": 0.4826304503216583, + "grad_norm": 0.7587664723396301, + "learning_rate": 9.660501900166734e-06, + "loss": 0.043, + "step": 16880 + }, + { + "epoch": 0.48291636883488204, + "grad_norm": 0.8740283250808716, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0372, + "step": 16890 + }, + { + "epoch": 0.4832022873481058, + "grad_norm": 0.3009270429611206, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0373, + "step": 16900 + }, + { + "epoch": 0.4834882058613295, + "grad_norm": 0.4439285695552826, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0349, + "step": 16910 + }, + { + "epoch": 0.48377412437455325, + "grad_norm": 0.39849671721458435, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0394, + "step": 16920 + }, + { + "epoch": 0.484060042887777, + "grad_norm": 0.6423043608665466, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0413, + "step": 16930 + }, + { + "epoch": 0.4843459614010007, + "grad_norm": 0.3683928847312927, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0387, + "step": 16940 + }, + { + "epoch": 0.48463187991422446, + "grad_norm": 0.7087769508361816, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0397, + "step": 16950 + }, + { + "epoch": 0.4849177984274482, + "grad_norm": 0.5348120927810669, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0405, + "step": 16960 + }, + { + "epoch": 0.4852037169406719, + "grad_norm": 0.549891471862793, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0363, + "step": 16970 + }, + { + "epoch": 0.4854896354538956, + "grad_norm": 0.7177272439002991, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0343, + "step": 16980 + }, + { + "epoch": 0.48577555396711936, + "grad_norm": 0.595417320728302, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0439, + "step": 16990 + }, + { + "epoch": 0.4860614724803431, + "grad_norm": 0.4838889241218567, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0387, + "step": 17000 + }, + { + "epoch": 0.48634739099356683, + "grad_norm": 0.6186223030090332, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0362, + "step": 17010 + }, + { + "epoch": 0.48663330950679057, + "grad_norm": 0.43383121490478516, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0381, + "step": 17020 + }, + { + "epoch": 0.4869192280200143, + "grad_norm": 0.6735527515411377, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0388, + "step": 17030 + }, + { + "epoch": 0.48720514653323804, + "grad_norm": 0.3746320605278015, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0491, + "step": 17040 + }, + { + "epoch": 0.4874910650464618, + "grad_norm": 0.29500988125801086, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0395, + "step": 17050 + }, + { + "epoch": 0.48777698355968546, + "grad_norm": 0.8518465757369995, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0435, + "step": 17060 + }, + { + "epoch": 0.4880629020729092, + "grad_norm": 0.9653190970420837, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0393, + "step": 17070 + }, + { + "epoch": 0.48834882058613294, + "grad_norm": 0.785724937915802, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0372, + "step": 17080 + }, + { + "epoch": 0.4886347390993567, + "grad_norm": 0.9450638890266418, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0406, + "step": 17090 + }, + { + "epoch": 0.4889206576125804, + "grad_norm": 0.645124077796936, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0361, + "step": 17100 + }, + { + "epoch": 0.48920657612580415, + "grad_norm": 0.3352372944355011, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0417, + "step": 17110 + }, + { + "epoch": 0.4894924946390279, + "grad_norm": 0.3858814835548401, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0345, + "step": 17120 + }, + { + "epoch": 0.4897784131522516, + "grad_norm": 0.5403604507446289, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0326, + "step": 17130 + }, + { + "epoch": 0.49006433166547536, + "grad_norm": 0.6986777782440186, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0417, + "step": 17140 + }, + { + "epoch": 0.49035025017869904, + "grad_norm": 0.5456675887107849, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0473, + "step": 17150 + }, + { + "epoch": 0.4906361686919228, + "grad_norm": 0.3961554765701294, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0341, + "step": 17160 + }, + { + "epoch": 0.4909220872051465, + "grad_norm": 0.5188277363777161, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0369, + "step": 17170 + }, + { + "epoch": 0.49120800571837026, + "grad_norm": 0.6042230725288391, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0352, + "step": 17180 + }, + { + "epoch": 0.491493924231594, + "grad_norm": 0.5485941171646118, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0405, + "step": 17190 + }, + { + "epoch": 0.49177984274481773, + "grad_norm": 0.5856509804725647, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0402, + "step": 17200 + }, + { + "epoch": 0.49206576125804147, + "grad_norm": 0.8656556010246277, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0349, + "step": 17210 + }, + { + "epoch": 0.4923516797712652, + "grad_norm": 0.4041757583618164, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0364, + "step": 17220 + }, + { + "epoch": 0.49263759828448894, + "grad_norm": 0.6135975122451782, + "learning_rate": 9.324104146177972e-06, + "loss": 0.036, + "step": 17230 + }, + { + "epoch": 0.4929235167977126, + "grad_norm": 0.5101860165596008, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0359, + "step": 17240 + }, + { + "epoch": 0.49320943531093636, + "grad_norm": 0.9913426041603088, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0552, + "step": 17250 + }, + { + "epoch": 0.4934953538241601, + "grad_norm": 0.6148158311843872, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0388, + "step": 17260 + }, + { + "epoch": 0.49378127233738384, + "grad_norm": 0.6651721596717834, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0374, + "step": 17270 + }, + { + "epoch": 0.4940671908506076, + "grad_norm": 0.9545061588287354, + "learning_rate": 9.276232738281744e-06, + "loss": 0.035, + "step": 17280 + }, + { + "epoch": 0.4943531093638313, + "grad_norm": 0.8923225402832031, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0366, + "step": 17290 + }, + { + "epoch": 0.49463902787705505, + "grad_norm": 0.5337848663330078, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0354, + "step": 17300 + }, + { + "epoch": 0.4949249463902788, + "grad_norm": 0.35039281845092773, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0341, + "step": 17310 + }, + { + "epoch": 0.4952108649035025, + "grad_norm": 0.47406911849975586, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0393, + "step": 17320 + }, + { + "epoch": 0.4954967834167262, + "grad_norm": 0.6226631999015808, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0375, + "step": 17330 + }, + { + "epoch": 0.49578270192994994, + "grad_norm": 0.6652712821960449, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0363, + "step": 17340 + }, + { + "epoch": 0.4960686204431737, + "grad_norm": 1.0042835474014282, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0368, + "step": 17350 + }, + { + "epoch": 0.4963545389563974, + "grad_norm": 0.4334045648574829, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0375, + "step": 17360 + }, + { + "epoch": 0.49664045746962115, + "grad_norm": 0.3561633229255676, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0347, + "step": 17370 + }, + { + "epoch": 0.4969263759828449, + "grad_norm": 0.5763550996780396, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0344, + "step": 17380 + }, + { + "epoch": 0.49721229449606863, + "grad_norm": 0.6306643486022949, + "learning_rate": 9.171095634265995e-06, + "loss": 0.037, + "step": 17390 + }, + { + "epoch": 0.49749821300929237, + "grad_norm": 0.4286569058895111, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0308, + "step": 17400 + }, + { + "epoch": 0.4977841315225161, + "grad_norm": 0.577983558177948, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0393, + "step": 17410 + }, + { + "epoch": 0.4980700500357398, + "grad_norm": 0.5714932084083557, + "learning_rate": 9.142466323573853e-06, + "loss": 0.038, + "step": 17420 + }, + { + "epoch": 0.4983559685489635, + "grad_norm": 0.7529498338699341, + "learning_rate": 9.132927564918328e-06, + "loss": 0.033, + "step": 17430 + }, + { + "epoch": 0.49864188706218726, + "grad_norm": 0.5179672241210938, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0367, + "step": 17440 + }, + { + "epoch": 0.498927805575411, + "grad_norm": 0.38424569368362427, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0401, + "step": 17450 + }, + { + "epoch": 0.49921372408863474, + "grad_norm": 0.469460129737854, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0379, + "step": 17460 + }, + { + "epoch": 0.4994996426018585, + "grad_norm": 0.3285387456417084, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0399, + "step": 17470 + }, + { + "epoch": 0.4997855611150822, + "grad_norm": 0.49863550066947937, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0313, + "step": 17480 + }, + { + "epoch": 0.5000714796283059, + "grad_norm": 0.3926186263561249, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0454, + "step": 17490 + }, + { + "epoch": 0.5003573981415297, + "grad_norm": 0.4476146399974823, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0472, + "step": 17500 + }, + { + "epoch": 0.5006433166547534, + "grad_norm": 0.5645599961280823, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0358, + "step": 17510 + }, + { + "epoch": 0.5009292351679772, + "grad_norm": 0.4813307225704193, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0364, + "step": 17520 + }, + { + "epoch": 0.5012151536812008, + "grad_norm": 0.49410971999168396, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0303, + "step": 17530 + }, + { + "epoch": 0.5015010721944246, + "grad_norm": 0.7172105312347412, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0404, + "step": 17540 + }, + { + "epoch": 0.5017869907076483, + "grad_norm": 0.43401873111724854, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0402, + "step": 17550 + }, + { + "epoch": 0.502072909220872, + "grad_norm": 0.6497406363487244, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0364, + "step": 17560 + }, + { + "epoch": 0.5023588277340958, + "grad_norm": 0.44618356227874756, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0337, + "step": 17570 + }, + { + "epoch": 0.5026447462473195, + "grad_norm": 0.4186992049217224, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0381, + "step": 17580 + }, + { + "epoch": 0.5029306647605433, + "grad_norm": 0.7387974858283997, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0319, + "step": 17590 + }, + { + "epoch": 0.503216583273767, + "grad_norm": 0.8068642020225525, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0373, + "step": 17600 + }, + { + "epoch": 0.5035025017869907, + "grad_norm": 0.5773473978042603, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0372, + "step": 17610 + }, + { + "epoch": 0.5037884203002144, + "grad_norm": 0.32488778233528137, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0334, + "step": 17620 + }, + { + "epoch": 0.5040743388134382, + "grad_norm": 0.33978500962257385, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0493, + "step": 17630 + }, + { + "epoch": 0.5043602573266619, + "grad_norm": 0.5897071361541748, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0335, + "step": 17640 + }, + { + "epoch": 0.5046461758398856, + "grad_norm": 0.6275895833969116, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0395, + "step": 17650 + }, + { + "epoch": 0.5049320943531094, + "grad_norm": 0.7995536923408508, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0422, + "step": 17660 + }, + { + "epoch": 0.505218012866333, + "grad_norm": 0.8734716773033142, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0414, + "step": 17670 + }, + { + "epoch": 0.5055039313795568, + "grad_norm": 0.6239343881607056, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0333, + "step": 17680 + }, + { + "epoch": 0.5057898498927805, + "grad_norm": 0.42508623003959656, + "learning_rate": 8.885721609997551e-06, + "loss": 0.045, + "step": 17690 + }, + { + "epoch": 0.5060757684060043, + "grad_norm": 0.4272485673427582, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0506, + "step": 17700 + }, + { + "epoch": 0.506361686919228, + "grad_norm": 0.8006368279457092, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0431, + "step": 17710 + }, + { + "epoch": 0.5066476054324518, + "grad_norm": 0.5896835327148438, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0322, + "step": 17720 + }, + { + "epoch": 0.5069335239456755, + "grad_norm": 0.6880389451980591, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0322, + "step": 17730 + }, + { + "epoch": 0.5072194424588992, + "grad_norm": 1.4850202798843384, + "learning_rate": 8.83836825410936e-06, + "loss": 0.052, + "step": 17740 + }, + { + "epoch": 0.507505360972123, + "grad_norm": 0.7684240937232971, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0353, + "step": 17750 + }, + { + "epoch": 0.5077912794853466, + "grad_norm": 0.5456307530403137, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0419, + "step": 17760 + }, + { + "epoch": 0.5080771979985704, + "grad_norm": 0.5775120258331299, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0366, + "step": 17770 + }, + { + "epoch": 0.5083631165117941, + "grad_norm": 0.6453070044517517, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0341, + "step": 17780 + }, + { + "epoch": 0.5086490350250179, + "grad_norm": 0.7906973361968994, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0405, + "step": 17790 + }, + { + "epoch": 0.5089349535382416, + "grad_norm": 1.0740606784820557, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0344, + "step": 17800 + }, + { + "epoch": 0.5092208720514654, + "grad_norm": 0.41854357719421387, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0334, + "step": 17810 + }, + { + "epoch": 0.5095067905646891, + "grad_norm": 0.6328964233398438, + "learning_rate": 8.762735374981932e-06, + "loss": 0.036, + "step": 17820 + }, + { + "epoch": 0.5097927090779127, + "grad_norm": 0.40875789523124695, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0338, + "step": 17830 + }, + { + "epoch": 0.5100786275911365, + "grad_norm": 0.5056312084197998, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0332, + "step": 17840 + }, + { + "epoch": 0.5103645461043602, + "grad_norm": 0.5005037784576416, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0416, + "step": 17850 + }, + { + "epoch": 0.510650464617584, + "grad_norm": 0.5689167380332947, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0329, + "step": 17860 + }, + { + "epoch": 0.5109363831308077, + "grad_norm": 0.5222717523574829, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0336, + "step": 17870 + }, + { + "epoch": 0.5112223016440315, + "grad_norm": 0.5998329520225525, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0354, + "step": 17880 + }, + { + "epoch": 0.5115082201572552, + "grad_norm": 0.4684480130672455, + "learning_rate": 8.69669425266315e-06, + "loss": 0.05, + "step": 17890 + }, + { + "epoch": 0.511794138670479, + "grad_norm": 0.4061124622821808, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0384, + "step": 17900 + }, + { + "epoch": 0.5120800571837026, + "grad_norm": 0.5025928020477295, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0386, + "step": 17910 + }, + { + "epoch": 0.5123659756969263, + "grad_norm": 0.3731222152709961, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0378, + "step": 17920 + }, + { + "epoch": 0.5126518942101501, + "grad_norm": 0.7784973978996277, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0419, + "step": 17930 + }, + { + "epoch": 0.5129378127233738, + "grad_norm": 0.7074074745178223, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0386, + "step": 17940 + }, + { + "epoch": 0.5132237312365976, + "grad_norm": 0.49802306294441223, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0418, + "step": 17950 + }, + { + "epoch": 0.5135096497498213, + "grad_norm": 0.4355427920818329, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0431, + "step": 17960 + }, + { + "epoch": 0.5137955682630451, + "grad_norm": 0.672635555267334, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0403, + "step": 17970 + }, + { + "epoch": 0.5140814867762687, + "grad_norm": 0.6733908653259277, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0487, + "step": 17980 + }, + { + "epoch": 0.5143674052894925, + "grad_norm": 0.43711504340171814, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0378, + "step": 17990 + }, + { + "epoch": 0.5146533238027162, + "grad_norm": 0.6371222138404846, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0336, + "step": 18000 + }, + { + "epoch": 0.5149392423159399, + "grad_norm": 0.8007041811943054, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0371, + "step": 18010 + }, + { + "epoch": 0.5152251608291637, + "grad_norm": 0.4725078344345093, + "learning_rate": 8.574400723012433e-06, + "loss": 0.037, + "step": 18020 + }, + { + "epoch": 0.5155110793423874, + "grad_norm": 0.34229791164398193, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0353, + "step": 18030 + }, + { + "epoch": 0.5157969978556112, + "grad_norm": 0.27863454818725586, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0371, + "step": 18040 + }, + { + "epoch": 0.5160829163688349, + "grad_norm": 0.43021920323371887, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0419, + "step": 18050 + }, + { + "epoch": 0.5163688348820586, + "grad_norm": 0.4683758318424225, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0307, + "step": 18060 + }, + { + "epoch": 0.5166547533952823, + "grad_norm": 0.29085367918014526, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0372, + "step": 18070 + }, + { + "epoch": 0.5169406719085061, + "grad_norm": 0.4396727681159973, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0328, + "step": 18080 + }, + { + "epoch": 0.5172265904217298, + "grad_norm": 0.539021372795105, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0317, + "step": 18090 + }, + { + "epoch": 0.5175125089349535, + "grad_norm": 0.556974470615387, + "learning_rate": 8.499380733111628e-06, + "loss": 0.037, + "step": 18100 + }, + { + "epoch": 0.5177984274481773, + "grad_norm": 0.4445747137069702, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0344, + "step": 18110 + }, + { + "epoch": 0.518084345961401, + "grad_norm": 0.3742713928222656, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0339, + "step": 18120 + }, + { + "epoch": 0.5183702644746248, + "grad_norm": 0.8467416167259216, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0409, + "step": 18130 + }, + { + "epoch": 0.5186561829878484, + "grad_norm": 0.7731484770774841, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0379, + "step": 18140 + }, + { + "epoch": 0.5189421015010722, + "grad_norm": 0.5664084553718567, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0353, + "step": 18150 + }, + { + "epoch": 0.5192280200142959, + "grad_norm": 0.5623966455459595, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0412, + "step": 18160 + }, + { + "epoch": 0.5195139385275197, + "grad_norm": 0.5074556469917297, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0402, + "step": 18170 + }, + { + "epoch": 0.5197998570407434, + "grad_norm": 0.49439728260040283, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0341, + "step": 18180 + }, + { + "epoch": 0.5200857755539671, + "grad_norm": 0.5982527136802673, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0308, + "step": 18190 + }, + { + "epoch": 0.5203716940671909, + "grad_norm": 0.7891598343849182, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0437, + "step": 18200 + }, + { + "epoch": 0.5206576125804145, + "grad_norm": 0.7565666437149048, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0381, + "step": 18210 + }, + { + "epoch": 0.5209435310936383, + "grad_norm": 0.33346351981163025, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0454, + "step": 18220 + }, + { + "epoch": 0.521229449606862, + "grad_norm": 0.5885659456253052, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0413, + "step": 18230 + }, + { + "epoch": 0.5215153681200858, + "grad_norm": 0.6487091183662415, + "learning_rate": 8.368551060444755e-06, + "loss": 0.035, + "step": 18240 + }, + { + "epoch": 0.5218012866333095, + "grad_norm": 0.9817430377006531, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0394, + "step": 18250 + }, + { + "epoch": 0.5220872051465333, + "grad_norm": 0.5691193342208862, + "learning_rate": 8.349909816537207e-06, + "loss": 0.041, + "step": 18260 + }, + { + "epoch": 0.522373123659757, + "grad_norm": 0.5326661467552185, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0361, + "step": 18270 + }, + { + "epoch": 0.5226590421729806, + "grad_norm": 0.5536142587661743, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0406, + "step": 18280 + }, + { + "epoch": 0.5229449606862044, + "grad_norm": 0.3482394218444824, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0423, + "step": 18290 + }, + { + "epoch": 0.5232308791994281, + "grad_norm": 0.514914333820343, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0352, + "step": 18300 + }, + { + "epoch": 0.5235167977126519, + "grad_norm": 0.7681404948234558, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0386, + "step": 18310 + }, + { + "epoch": 0.5238027162258756, + "grad_norm": 0.400426983833313, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0333, + "step": 18320 + }, + { + "epoch": 0.5240886347390994, + "grad_norm": 0.4996081590652466, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0381, + "step": 18330 + }, + { + "epoch": 0.5243745532523231, + "grad_norm": 0.5379085540771484, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0348, + "step": 18340 + }, + { + "epoch": 0.5246604717655469, + "grad_norm": 0.4462053179740906, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0307, + "step": 18350 + }, + { + "epoch": 0.5249463902787705, + "grad_norm": 0.7336096167564392, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0345, + "step": 18360 + }, + { + "epoch": 0.5252323087919942, + "grad_norm": 0.6676360368728638, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0346, + "step": 18370 + }, + { + "epoch": 0.525518227305218, + "grad_norm": 0.46608656644821167, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0334, + "step": 18380 + }, + { + "epoch": 0.5258041458184417, + "grad_norm": 0.4906940460205078, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0331, + "step": 18390 + }, + { + "epoch": 0.5260900643316655, + "grad_norm": 0.4200032353401184, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0394, + "step": 18400 + }, + { + "epoch": 0.5263759828448892, + "grad_norm": 0.5663877725601196, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0349, + "step": 18410 + }, + { + "epoch": 0.526661901358113, + "grad_norm": 0.36824384331703186, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0303, + "step": 18420 + }, + { + "epoch": 0.5269478198713367, + "grad_norm": 0.8120076060295105, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0443, + "step": 18430 + }, + { + "epoch": 0.5272337383845604, + "grad_norm": 0.4102472960948944, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0369, + "step": 18440 + }, + { + "epoch": 0.5275196568977841, + "grad_norm": 0.5186526775360107, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0338, + "step": 18450 + }, + { + "epoch": 0.5278055754110078, + "grad_norm": 0.9650108218193054, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0343, + "step": 18460 + }, + { + "epoch": 0.5280914939242316, + "grad_norm": 0.5894375443458557, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0416, + "step": 18470 + }, + { + "epoch": 0.5283774124374553, + "grad_norm": 0.6188816428184509, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0402, + "step": 18480 + }, + { + "epoch": 0.5286633309506791, + "grad_norm": 0.35280847549438477, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0363, + "step": 18490 + }, + { + "epoch": 0.5289492494639028, + "grad_norm": 0.7289313673973083, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0392, + "step": 18500 + }, + { + "epoch": 0.5292351679771266, + "grad_norm": 0.505050778388977, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0329, + "step": 18510 + }, + { + "epoch": 0.5295210864903502, + "grad_norm": 0.7029705047607422, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0344, + "step": 18520 + }, + { + "epoch": 0.529807005003574, + "grad_norm": 0.2958471477031708, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0431, + "step": 18530 + }, + { + "epoch": 0.5300929235167977, + "grad_norm": 0.9649683237075806, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0329, + "step": 18540 + }, + { + "epoch": 0.5303788420300214, + "grad_norm": 0.24733735620975494, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0354, + "step": 18550 + }, + { + "epoch": 0.5306647605432452, + "grad_norm": 0.44838136434555054, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0334, + "step": 18560 + }, + { + "epoch": 0.5309506790564689, + "grad_norm": 0.4505597949028015, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0338, + "step": 18570 + }, + { + "epoch": 0.5312365975696927, + "grad_norm": 0.44188442826271057, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0326, + "step": 18580 + }, + { + "epoch": 0.5315225160829163, + "grad_norm": 0.4539152979850769, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0369, + "step": 18590 + }, + { + "epoch": 0.5318084345961401, + "grad_norm": 0.8311023712158203, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0441, + "step": 18600 + }, + { + "epoch": 0.5320943531093638, + "grad_norm": 0.53764808177948, + "learning_rate": 8.025779439806006e-06, + "loss": 0.037, + "step": 18610 + }, + { + "epoch": 0.5323802716225876, + "grad_norm": 1.2192102670669556, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0369, + "step": 18620 + }, + { + "epoch": 0.5326661901358113, + "grad_norm": 0.5254611968994141, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0371, + "step": 18630 + }, + { + "epoch": 0.532952108649035, + "grad_norm": 0.585709810256958, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0337, + "step": 18640 + }, + { + "epoch": 0.5332380271622588, + "grad_norm": 0.45416259765625, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0425, + "step": 18650 + }, + { + "epoch": 0.5335239456754824, + "grad_norm": 0.3957739472389221, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0354, + "step": 18660 + }, + { + "epoch": 0.5338098641887062, + "grad_norm": 0.6211117506027222, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0347, + "step": 18670 + }, + { + "epoch": 0.5340957827019299, + "grad_norm": 0.49023327231407166, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0308, + "step": 18680 + }, + { + "epoch": 0.5343817012151537, + "grad_norm": 0.5823351144790649, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0351, + "step": 18690 + }, + { + "epoch": 0.5346676197283774, + "grad_norm": 0.6048677563667297, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0382, + "step": 18700 + }, + { + "epoch": 0.5349535382416012, + "grad_norm": 0.5293828845024109, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0329, + "step": 18710 + }, + { + "epoch": 0.5352394567548249, + "grad_norm": 0.5935509204864502, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0388, + "step": 18720 + }, + { + "epoch": 0.5355253752680486, + "grad_norm": 0.8369598388671875, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0421, + "step": 18730 + }, + { + "epoch": 0.5358112937812723, + "grad_norm": 0.6874870657920837, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0385, + "step": 18740 + }, + { + "epoch": 0.536097212294496, + "grad_norm": 0.43511492013931274, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0405, + "step": 18750 + }, + { + "epoch": 0.5363831308077198, + "grad_norm": 0.662755012512207, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0375, + "step": 18760 + }, + { + "epoch": 0.5366690493209435, + "grad_norm": 0.5519852638244629, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0351, + "step": 18770 + }, + { + "epoch": 0.5369549678341673, + "grad_norm": 0.9711637496948242, + "learning_rate": 7.869858673101027e-06, + "loss": 0.038, + "step": 18780 + }, + { + "epoch": 0.537240886347391, + "grad_norm": 0.4944411516189575, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0416, + "step": 18790 + }, + { + "epoch": 0.5375268048606148, + "grad_norm": 0.5257377624511719, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0349, + "step": 18800 + }, + { + "epoch": 0.5378127233738385, + "grad_norm": 0.4833063781261444, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0414, + "step": 18810 + }, + { + "epoch": 0.5380986418870621, + "grad_norm": 0.4496164917945862, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0369, + "step": 18820 + }, + { + "epoch": 0.5383845604002859, + "grad_norm": 0.6939138174057007, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0337, + "step": 18830 + }, + { + "epoch": 0.5386704789135096, + "grad_norm": 0.32579538226127625, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0371, + "step": 18840 + }, + { + "epoch": 0.5389563974267334, + "grad_norm": 0.35594654083251953, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0366, + "step": 18850 + }, + { + "epoch": 0.5392423159399571, + "grad_norm": 0.6114012002944946, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0372, + "step": 18860 + }, + { + "epoch": 0.5395282344531809, + "grad_norm": 0.8492457270622253, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0346, + "step": 18870 + }, + { + "epoch": 0.5398141529664046, + "grad_norm": 0.5214036703109741, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0388, + "step": 18880 + }, + { + "epoch": 0.5401000714796284, + "grad_norm": 0.428671658039093, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0394, + "step": 18890 + }, + { + "epoch": 0.540385989992852, + "grad_norm": 0.6071562767028809, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0371, + "step": 18900 + }, + { + "epoch": 0.5406719085060757, + "grad_norm": 0.41996505856513977, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0334, + "step": 18910 + }, + { + "epoch": 0.5409578270192995, + "grad_norm": 0.5260844826698303, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0362, + "step": 18920 + }, + { + "epoch": 0.5412437455325232, + "grad_norm": 0.43362122774124146, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0325, + "step": 18930 + }, + { + "epoch": 0.541529664045747, + "grad_norm": 0.4597149193286896, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0345, + "step": 18940 + }, + { + "epoch": 0.5418155825589707, + "grad_norm": 0.6667322516441345, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0421, + "step": 18950 + }, + { + "epoch": 0.5421015010721945, + "grad_norm": 0.8998900651931763, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0368, + "step": 18960 + }, + { + "epoch": 0.5423874195854181, + "grad_norm": 0.5075538158416748, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0327, + "step": 18970 + }, + { + "epoch": 0.5426733380986419, + "grad_norm": 0.38445526361465454, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0446, + "step": 18980 + }, + { + "epoch": 0.5429592566118656, + "grad_norm": 0.696186363697052, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0364, + "step": 18990 + }, + { + "epoch": 0.5432451751250893, + "grad_norm": 0.6371187567710876, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0401, + "step": 19000 + }, + { + "epoch": 0.5435310936383131, + "grad_norm": 0.6122881174087524, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0345, + "step": 19010 + }, + { + "epoch": 0.5438170121515368, + "grad_norm": 0.4222267270088196, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0456, + "step": 19020 + }, + { + "epoch": 0.5441029306647606, + "grad_norm": 0.6122517585754395, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0434, + "step": 19030 + }, + { + "epoch": 0.5443888491779842, + "grad_norm": 0.2783992886543274, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0354, + "step": 19040 + }, + { + "epoch": 0.544674767691208, + "grad_norm": 0.6433000564575195, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0321, + "step": 19050 + }, + { + "epoch": 0.5449606862044317, + "grad_norm": 0.6967030167579651, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0394, + "step": 19060 + }, + { + "epoch": 0.5452466047176555, + "grad_norm": 0.4799044132232666, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0329, + "step": 19070 + }, + { + "epoch": 0.5455325232308792, + "grad_norm": 0.633895993232727, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0316, + "step": 19080 + }, + { + "epoch": 0.5458184417441029, + "grad_norm": 0.5601945519447327, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0449, + "step": 19090 + }, + { + "epoch": 0.5461043602573267, + "grad_norm": 0.4917007088661194, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0351, + "step": 19100 + }, + { + "epoch": 0.5463902787705504, + "grad_norm": 0.4813363254070282, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.029, + "step": 19110 + }, + { + "epoch": 0.5466761972837741, + "grad_norm": 0.5359676480293274, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0346, + "step": 19120 + }, + { + "epoch": 0.5469621157969978, + "grad_norm": 0.6500958204269409, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0374, + "step": 19130 + }, + { + "epoch": 0.5472480343102216, + "grad_norm": 0.7708510756492615, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0332, + "step": 19140 + }, + { + "epoch": 0.5475339528234453, + "grad_norm": 0.45693230628967285, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0344, + "step": 19150 + }, + { + "epoch": 0.5478198713366691, + "grad_norm": 0.6046226620674133, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0342, + "step": 19160 + }, + { + "epoch": 0.5481057898498928, + "grad_norm": 0.5253175497055054, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0449, + "step": 19170 + }, + { + "epoch": 0.5483917083631165, + "grad_norm": 0.3790060877799988, + "learning_rate": 7.507267205473318e-06, + "loss": 0.037, + "step": 19180 + }, + { + "epoch": 0.5486776268763403, + "grad_norm": 0.37709203362464905, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0346, + "step": 19190 + }, + { + "epoch": 0.5489635453895639, + "grad_norm": 0.3940931558609009, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0427, + "step": 19200 + }, + { + "epoch": 0.5492494639027877, + "grad_norm": 0.761299192905426, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0353, + "step": 19210 + }, + { + "epoch": 0.5495353824160114, + "grad_norm": 0.5268495082855225, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0328, + "step": 19220 + }, + { + "epoch": 0.5498213009292352, + "grad_norm": 0.45624151825904846, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0353, + "step": 19230 + }, + { + "epoch": 0.5501072194424589, + "grad_norm": 0.5374972224235535, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0345, + "step": 19240 + }, + { + "epoch": 0.5503931379556827, + "grad_norm": 0.49830907583236694, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0328, + "step": 19250 + }, + { + "epoch": 0.5506790564689064, + "grad_norm": 0.6223296523094177, + "learning_rate": 7.435514206212475e-06, + "loss": 0.037, + "step": 19260 + }, + { + "epoch": 0.55096497498213, + "grad_norm": 0.42801398038864136, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0371, + "step": 19270 + }, + { + "epoch": 0.5512508934953538, + "grad_norm": 0.3872825801372528, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0314, + "step": 19280 + }, + { + "epoch": 0.5515368120085775, + "grad_norm": 0.3967494070529938, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0349, + "step": 19290 + }, + { + "epoch": 0.5518227305218013, + "grad_norm": 0.42383769154548645, + "learning_rate": 7.399737764864619e-06, + "loss": 0.045, + "step": 19300 + }, + { + "epoch": 0.552108649035025, + "grad_norm": 0.48501884937286377, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0373, + "step": 19310 + }, + { + "epoch": 0.5523945675482488, + "grad_norm": 0.3783693015575409, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0334, + "step": 19320 + }, + { + "epoch": 0.5526804860614725, + "grad_norm": 0.5733019709587097, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0369, + "step": 19330 + }, + { + "epoch": 0.5529664045746963, + "grad_norm": 0.5022825002670288, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0375, + "step": 19340 + }, + { + "epoch": 0.5532523230879199, + "grad_norm": 0.5508015155792236, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0415, + "step": 19350 + }, + { + "epoch": 0.5535382416011436, + "grad_norm": 0.5692425966262817, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0401, + "step": 19360 + }, + { + "epoch": 0.5538241601143674, + "grad_norm": 0.7247840762138367, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0373, + "step": 19370 + }, + { + "epoch": 0.5541100786275911, + "grad_norm": 0.633986234664917, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0435, + "step": 19380 + }, + { + "epoch": 0.5543959971408149, + "grad_norm": 0.8598711490631104, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0424, + "step": 19390 + }, + { + "epoch": 0.5546819156540386, + "grad_norm": 0.782328188419342, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0424, + "step": 19400 + }, + { + "epoch": 0.5549678341672624, + "grad_norm": 0.48890456557273865, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0351, + "step": 19410 + }, + { + "epoch": 0.555253752680486, + "grad_norm": 0.4759981036186218, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0395, + "step": 19420 + }, + { + "epoch": 0.5555396711937098, + "grad_norm": 0.6431323885917664, + "learning_rate": 7.283934675167239e-06, + "loss": 0.036, + "step": 19430 + }, + { + "epoch": 0.5558255897069335, + "grad_norm": 0.6633809208869934, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0333, + "step": 19440 + }, + { + "epoch": 0.5561115082201572, + "grad_norm": 0.3405994772911072, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0375, + "step": 19450 + }, + { + "epoch": 0.556397426733381, + "grad_norm": 0.3443987965583801, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0329, + "step": 19460 + }, + { + "epoch": 0.5566833452466047, + "grad_norm": 0.7973398566246033, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0412, + "step": 19470 + }, + { + "epoch": 0.5569692637598285, + "grad_norm": 0.43843239545822144, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0302, + "step": 19480 + }, + { + "epoch": 0.5572551822730522, + "grad_norm": 0.6797782182693481, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0401, + "step": 19490 + }, + { + "epoch": 0.557541100786276, + "grad_norm": 0.5020610690116882, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0329, + "step": 19500 + }, + { + "epoch": 0.5578270192994996, + "grad_norm": 0.5093050003051758, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0381, + "step": 19510 + }, + { + "epoch": 0.5581129378127234, + "grad_norm": 0.6136947870254517, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0373, + "step": 19520 + }, + { + "epoch": 0.5583988563259471, + "grad_norm": 0.4213317930698395, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0338, + "step": 19530 + }, + { + "epoch": 0.5586847748391708, + "grad_norm": 0.6560636162757874, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0332, + "step": 19540 + }, + { + "epoch": 0.5589706933523946, + "grad_norm": 0.41303765773773193, + "learning_rate": 7.177693135871202e-06, + "loss": 0.03, + "step": 19550 + }, + { + "epoch": 0.5592566118656183, + "grad_norm": 0.5260538458824158, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0328, + "step": 19560 + }, + { + "epoch": 0.559542530378842, + "grad_norm": 0.6076327562332153, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0428, + "step": 19570 + }, + { + "epoch": 0.5598284488920657, + "grad_norm": 0.635111927986145, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0318, + "step": 19580 + }, + { + "epoch": 0.5601143674052895, + "grad_norm": 0.7933056354522705, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0357, + "step": 19590 + }, + { + "epoch": 0.5604002859185132, + "grad_norm": 0.44312241673469543, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0289, + "step": 19600 + }, + { + "epoch": 0.560686204431737, + "grad_norm": 0.36346134543418884, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0354, + "step": 19610 + }, + { + "epoch": 0.5609721229449607, + "grad_norm": 0.49605289101600647, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0367, + "step": 19620 + }, + { + "epoch": 0.5612580414581844, + "grad_norm": 0.7115452289581299, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0354, + "step": 19630 + }, + { + "epoch": 0.5615439599714082, + "grad_norm": 0.650925874710083, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0353, + "step": 19640 + }, + { + "epoch": 0.5618298784846318, + "grad_norm": 0.5046663880348206, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0294, + "step": 19650 + }, + { + "epoch": 0.5621157969978556, + "grad_norm": 0.4441855549812317, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0326, + "step": 19660 + }, + { + "epoch": 0.5624017155110793, + "grad_norm": 0.3956650495529175, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0446, + "step": 19670 + }, + { + "epoch": 0.5626876340243031, + "grad_norm": 0.5384211540222168, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0331, + "step": 19680 + }, + { + "epoch": 0.5629735525375268, + "grad_norm": 0.6183366775512695, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0324, + "step": 19690 + }, + { + "epoch": 0.5632594710507506, + "grad_norm": 0.9116242527961731, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0341, + "step": 19700 + }, + { + "epoch": 0.5635453895639743, + "grad_norm": 0.8171015381813049, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0306, + "step": 19710 + }, + { + "epoch": 0.563831308077198, + "grad_norm": 0.42670243978500366, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0336, + "step": 19720 + }, + { + "epoch": 0.5641172265904217, + "grad_norm": 0.7338811159133911, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0363, + "step": 19730 + }, + { + "epoch": 0.5644031451036454, + "grad_norm": 0.5576338171958923, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0371, + "step": 19740 + }, + { + "epoch": 0.5646890636168692, + "grad_norm": 0.7390629649162292, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0382, + "step": 19750 + }, + { + "epoch": 0.5649749821300929, + "grad_norm": 0.801812469959259, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0379, + "step": 19760 + }, + { + "epoch": 0.5652609006433167, + "grad_norm": 0.5697385668754578, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0369, + "step": 19770 + }, + { + "epoch": 0.5655468191565404, + "grad_norm": 0.4180932343006134, + "learning_rate": 6.975884226362e-06, + "loss": 0.039, + "step": 19780 + }, + { + "epoch": 0.5658327376697642, + "grad_norm": 0.648389995098114, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0346, + "step": 19790 + }, + { + "epoch": 0.5661186561829878, + "grad_norm": 0.9673929214477539, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0392, + "step": 19800 + }, + { + "epoch": 0.5664045746962115, + "grad_norm": 0.4793975353240967, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0321, + "step": 19810 + }, + { + "epoch": 0.5666904932094353, + "grad_norm": 0.5206098556518555, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0319, + "step": 19820 + }, + { + "epoch": 0.566976411722659, + "grad_norm": 0.39929306507110596, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0335, + "step": 19830 + }, + { + "epoch": 0.5672623302358828, + "grad_norm": 0.6819440722465515, + "learning_rate": 6.923644220932124e-06, + "loss": 0.0338, + "step": 19840 + }, + { + "epoch": 0.5675482487491065, + "grad_norm": 0.7612042427062988, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0345, + "step": 19850 + }, + { + "epoch": 0.5678341672623303, + "grad_norm": 0.472676545381546, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0388, + "step": 19860 + }, + { + "epoch": 0.568120085775554, + "grad_norm": 0.48102107644081116, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0304, + "step": 19870 + }, + { + "epoch": 0.5684060042887777, + "grad_norm": 0.4174644649028778, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0315, + "step": 19880 + }, + { + "epoch": 0.5686919228020014, + "grad_norm": 0.4218151271343231, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0413, + "step": 19890 + }, + { + "epoch": 0.5689778413152251, + "grad_norm": 0.8243978023529053, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0399, + "step": 19900 + }, + { + "epoch": 0.5692637598284489, + "grad_norm": 0.400924414396286, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0298, + "step": 19910 + }, + { + "epoch": 0.5695496783416726, + "grad_norm": 0.5199277400970459, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0351, + "step": 19920 + }, + { + "epoch": 0.5698355968548964, + "grad_norm": 0.5238781571388245, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0374, + "step": 19930 + }, + { + "epoch": 0.5701215153681201, + "grad_norm": 0.7451756596565247, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0378, + "step": 19940 + }, + { + "epoch": 0.5704074338813439, + "grad_norm": 0.5029926300048828, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0391, + "step": 19950 + }, + { + "epoch": 0.5706933523945675, + "grad_norm": 0.5532147884368896, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0383, + "step": 19960 + }, + { + "epoch": 0.5709792709077913, + "grad_norm": 0.5694131851196289, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0364, + "step": 19970 + }, + { + "epoch": 0.571265189421015, + "grad_norm": 0.5066515803337097, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0363, + "step": 19980 + }, + { + "epoch": 0.5715511079342387, + "grad_norm": 0.5676470398902893, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0327, + "step": 19990 + }, + { + "epoch": 0.5718370264474625, + "grad_norm": 0.37414318323135376, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0395, + "step": 20000 + }, + { + "epoch": 0.5721229449606862, + "grad_norm": 0.5888793468475342, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0372, + "step": 20010 + }, + { + "epoch": 0.57240886347391, + "grad_norm": 0.6593262553215027, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0329, + "step": 20020 + }, + { + "epoch": 0.5726947819871336, + "grad_norm": 0.6382879614830017, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0286, + "step": 20030 + }, + { + "epoch": 0.5729807005003574, + "grad_norm": 0.6364927887916565, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0383, + "step": 20040 + }, + { + "epoch": 0.5732666190135811, + "grad_norm": 0.4102194011211395, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0342, + "step": 20050 + }, + { + "epoch": 0.5735525375268049, + "grad_norm": 0.6449235081672668, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0315, + "step": 20060 + }, + { + "epoch": 0.5738384560400286, + "grad_norm": 0.708431601524353, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0316, + "step": 20070 + }, + { + "epoch": 0.5741243745532523, + "grad_norm": 0.46444272994995117, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0352, + "step": 20080 + }, + { + "epoch": 0.5744102930664761, + "grad_norm": 0.7026715278625488, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0337, + "step": 20090 + }, + { + "epoch": 0.5746962115796997, + "grad_norm": 0.43397894501686096, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0303, + "step": 20100 + }, + { + "epoch": 0.5749821300929235, + "grad_norm": 0.4937734305858612, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0403, + "step": 20110 + }, + { + "epoch": 0.5752680486061472, + "grad_norm": 0.5981410145759583, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0375, + "step": 20120 + }, + { + "epoch": 0.575553967119371, + "grad_norm": 0.5616198778152466, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0314, + "step": 20130 + }, + { + "epoch": 0.5758398856325947, + "grad_norm": 0.35028502345085144, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0367, + "step": 20140 + }, + { + "epoch": 0.5761258041458185, + "grad_norm": 0.3556109666824341, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0308, + "step": 20150 + }, + { + "epoch": 0.5764117226590422, + "grad_norm": 0.579409658908844, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0344, + "step": 20160 + }, + { + "epoch": 0.5766976411722659, + "grad_norm": 0.4484683573246002, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0312, + "step": 20170 + }, + { + "epoch": 0.5769835596854896, + "grad_norm": 0.3636038899421692, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0337, + "step": 20180 + }, + { + "epoch": 0.5772694781987133, + "grad_norm": 0.6667287349700928, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0343, + "step": 20190 + }, + { + "epoch": 0.5775553967119371, + "grad_norm": 0.26031574606895447, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0303, + "step": 20200 + }, + { + "epoch": 0.5778413152251608, + "grad_norm": 0.6683355569839478, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0316, + "step": 20210 + }, + { + "epoch": 0.5781272337383846, + "grad_norm": 0.4097786843776703, + "learning_rate": 6.596880604028027e-06, + "loss": 0.0346, + "step": 20220 + }, + { + "epoch": 0.5784131522516083, + "grad_norm": 0.45405757427215576, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0345, + "step": 20230 + }, + { + "epoch": 0.5786990707648321, + "grad_norm": 0.28291839361190796, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0323, + "step": 20240 + }, + { + "epoch": 0.5789849892780558, + "grad_norm": 0.5656186938285828, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0299, + "step": 20250 + }, + { + "epoch": 0.5792709077912794, + "grad_norm": 0.6780310869216919, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0309, + "step": 20260 + }, + { + "epoch": 0.5795568263045032, + "grad_norm": 0.3968813121318817, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0347, + "step": 20270 + }, + { + "epoch": 0.5798427448177269, + "grad_norm": 0.6598440408706665, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0329, + "step": 20280 + }, + { + "epoch": 0.5801286633309507, + "grad_norm": 0.4988970458507538, + "learning_rate": 6.53748481975927e-06, + "loss": 0.038, + "step": 20290 + }, + { + "epoch": 0.5804145818441744, + "grad_norm": 0.8016706705093384, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0358, + "step": 20300 + }, + { + "epoch": 0.5807005003573982, + "grad_norm": 0.8367684483528137, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0354, + "step": 20310 + }, + { + "epoch": 0.5809864188706219, + "grad_norm": 0.5730129480361938, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0421, + "step": 20320 + }, + { + "epoch": 0.5812723373838456, + "grad_norm": 0.43631577491760254, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0361, + "step": 20330 + }, + { + "epoch": 0.5815582558970693, + "grad_norm": 0.7001264691352844, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0355, + "step": 20340 + }, + { + "epoch": 0.581844174410293, + "grad_norm": 0.4988951086997986, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0387, + "step": 20350 + }, + { + "epoch": 0.5821300929235168, + "grad_norm": 0.45731016993522644, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0398, + "step": 20360 + }, + { + "epoch": 0.5824160114367405, + "grad_norm": 0.38684406876564026, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0345, + "step": 20370 + }, + { + "epoch": 0.5827019299499643, + "grad_norm": 0.3924580514431, + "learning_rate": 6.461496350649529e-06, + "loss": 0.037, + "step": 20380 + }, + { + "epoch": 0.582987848463188, + "grad_norm": 0.43735265731811523, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0371, + "step": 20390 + }, + { + "epoch": 0.5832737669764118, + "grad_norm": 0.4595138430595398, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0337, + "step": 20400 + }, + { + "epoch": 0.5835596854896354, + "grad_norm": 0.429569810628891, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0284, + "step": 20410 + }, + { + "epoch": 0.5838456040028592, + "grad_norm": 0.5399166345596313, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0353, + "step": 20420 + }, + { + "epoch": 0.5841315225160829, + "grad_norm": 0.5698734521865845, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0361, + "step": 20430 + }, + { + "epoch": 0.5844174410293066, + "grad_norm": 0.35422587394714355, + "learning_rate": 6.411076603575166e-06, + "loss": 0.033, + "step": 20440 + }, + { + "epoch": 0.5847033595425304, + "grad_norm": 0.4475875198841095, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0344, + "step": 20450 + }, + { + "epoch": 0.5849892780557541, + "grad_norm": 0.4950159192085266, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0428, + "step": 20460 + }, + { + "epoch": 0.5852751965689779, + "grad_norm": 0.695249617099762, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0354, + "step": 20470 + }, + { + "epoch": 0.5855611150822015, + "grad_norm": 0.2538593113422394, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0383, + "step": 20480 + }, + { + "epoch": 0.5858470335954253, + "grad_norm": 0.6770910024642944, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0364, + "step": 20490 + }, + { + "epoch": 0.586132952108649, + "grad_norm": 0.7187057733535767, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0319, + "step": 20500 + }, + { + "epoch": 0.5864188706218728, + "grad_norm": 0.34853193163871765, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.033, + "step": 20510 + }, + { + "epoch": 0.5867047891350965, + "grad_norm": 0.8484768271446228, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0331, + "step": 20520 + }, + { + "epoch": 0.5869907076483202, + "grad_norm": 0.6645244359970093, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0353, + "step": 20530 + }, + { + "epoch": 0.587276626161544, + "grad_norm": 0.5094996690750122, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0374, + "step": 20540 + }, + { + "epoch": 0.5875625446747677, + "grad_norm": 0.5012859106063843, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0329, + "step": 20550 + }, + { + "epoch": 0.5878484631879914, + "grad_norm": 0.6465861797332764, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0282, + "step": 20560 + }, + { + "epoch": 0.5881343817012151, + "grad_norm": 0.5694834589958191, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0313, + "step": 20570 + }, + { + "epoch": 0.5884203002144389, + "grad_norm": 0.4945555627346039, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0353, + "step": 20580 + }, + { + "epoch": 0.5887062187276626, + "grad_norm": 0.5606586933135986, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0343, + "step": 20590 + }, + { + "epoch": 0.5889921372408864, + "grad_norm": 0.6913802027702332, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0358, + "step": 20600 + }, + { + "epoch": 0.5892780557541101, + "grad_norm": 0.8119901418685913, + "learning_rate": 6.269280523549298e-06, + "loss": 0.038, + "step": 20610 + }, + { + "epoch": 0.5895639742673338, + "grad_norm": 0.5558752417564392, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0311, + "step": 20620 + }, + { + "epoch": 0.5898498927805575, + "grad_norm": 0.45028987526893616, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0321, + "step": 20630 + }, + { + "epoch": 0.5901358112937812, + "grad_norm": 0.3697125017642975, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0331, + "step": 20640 + }, + { + "epoch": 0.590421729807005, + "grad_norm": 0.5406038761138916, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0445, + "step": 20650 + }, + { + "epoch": 0.5907076483202287, + "grad_norm": 0.4301048219203949, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0371, + "step": 20660 + }, + { + "epoch": 0.5909935668334525, + "grad_norm": 0.6343403458595276, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0353, + "step": 20670 + }, + { + "epoch": 0.5912794853466762, + "grad_norm": 0.4666310250759125, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0352, + "step": 20680 + }, + { + "epoch": 0.5915654038599, + "grad_norm": 0.7471063733100891, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0352, + "step": 20690 + }, + { + "epoch": 0.5918513223731237, + "grad_norm": 0.9971692562103271, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0327, + "step": 20700 + }, + { + "epoch": 0.5921372408863473, + "grad_norm": 0.5646237134933472, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0365, + "step": 20710 + }, + { + "epoch": 0.5924231593995711, + "grad_norm": 0.46781328320503235, + "learning_rate": 6.17838207381795e-06, + "loss": 0.042, + "step": 20720 + }, + { + "epoch": 0.5927090779127948, + "grad_norm": 0.7061547040939331, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0484, + "step": 20730 + }, + { + "epoch": 0.5929949964260186, + "grad_norm": 0.6651175618171692, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0353, + "step": 20740 + }, + { + "epoch": 0.5932809149392423, + "grad_norm": 0.5959596037864685, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0344, + "step": 20750 + }, + { + "epoch": 0.5935668334524661, + "grad_norm": 0.5869056582450867, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0389, + "step": 20760 + }, + { + "epoch": 0.5938527519656898, + "grad_norm": 0.42101356387138367, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0288, + "step": 20770 + }, + { + "epoch": 0.5941386704789136, + "grad_norm": 0.6310023069381714, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0362, + "step": 20780 + }, + { + "epoch": 0.5944245889921372, + "grad_norm": 0.6737013459205627, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0377, + "step": 20790 + }, + { + "epoch": 0.5947105075053609, + "grad_norm": 0.6716046333312988, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0415, + "step": 20800 + }, + { + "epoch": 0.5949964260185847, + "grad_norm": 0.9742669463157654, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0337, + "step": 20810 + }, + { + "epoch": 0.5952823445318084, + "grad_norm": 0.571782648563385, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0362, + "step": 20820 + }, + { + "epoch": 0.5955682630450322, + "grad_norm": 0.9673911333084106, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0362, + "step": 20830 + }, + { + "epoch": 0.5958541815582559, + "grad_norm": 0.5391695499420166, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0331, + "step": 20840 + }, + { + "epoch": 0.5961401000714797, + "grad_norm": 1.4766349792480469, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0332, + "step": 20850 + }, + { + "epoch": 0.5964260185847033, + "grad_norm": 0.6329004168510437, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0375, + "step": 20860 + }, + { + "epoch": 0.5967119370979271, + "grad_norm": 0.6745501160621643, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0347, + "step": 20870 + }, + { + "epoch": 0.5969978556111508, + "grad_norm": 0.3006536364555359, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0321, + "step": 20880 + }, + { + "epoch": 0.5972837741243745, + "grad_norm": 0.4666125476360321, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0363, + "step": 20890 + }, + { + "epoch": 0.5975696926375983, + "grad_norm": 0.3881456255912781, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0318, + "step": 20900 + }, + { + "epoch": 0.597855611150822, + "grad_norm": 0.4211449921131134, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0357, + "step": 20910 + }, + { + "epoch": 0.5981415296640458, + "grad_norm": 1.125683307647705, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0364, + "step": 20920 + }, + { + "epoch": 0.5984274481772694, + "grad_norm": 0.9670853614807129, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0385, + "step": 20930 + }, + { + "epoch": 0.5987133666904932, + "grad_norm": 0.7302138209342957, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0321, + "step": 20940 + }, + { + "epoch": 0.5989992852037169, + "grad_norm": 0.7883613109588623, + "learning_rate": 5.990549152010853e-06, + "loss": 0.038, + "step": 20950 + }, + { + "epoch": 0.5992852037169407, + "grad_norm": 0.44051188230514526, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0423, + "step": 20960 + }, + { + "epoch": 0.5995711222301644, + "grad_norm": 0.5225116014480591, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0293, + "step": 20970 + }, + { + "epoch": 0.5998570407433881, + "grad_norm": 0.44672495126724243, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0314, + "step": 20980 + }, + { + "epoch": 0.6001429592566119, + "grad_norm": 0.4489240050315857, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0298, + "step": 20990 + }, + { + "epoch": 0.6004288777698356, + "grad_norm": 0.3942757844924927, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0323, + "step": 21000 + }, + { + "epoch": 0.6007147962830593, + "grad_norm": 0.5079668760299683, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0435, + "step": 21010 + }, + { + "epoch": 0.601000714796283, + "grad_norm": 0.5057359933853149, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0364, + "step": 21020 + }, + { + "epoch": 0.6012866333095068, + "grad_norm": 0.4823545515537262, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0408, + "step": 21030 + }, + { + "epoch": 0.6015725518227305, + "grad_norm": 0.42647498846054077, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0366, + "step": 21040 + }, + { + "epoch": 0.6018584703359543, + "grad_norm": 0.5967830419540405, + "learning_rate": 5.909845843697164e-06, + "loss": 0.037, + "step": 21050 + }, + { + "epoch": 0.602144388849178, + "grad_norm": 0.4567292034626007, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0306, + "step": 21060 + }, + { + "epoch": 0.6024303073624017, + "grad_norm": 0.6767273545265198, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0447, + "step": 21070 + }, + { + "epoch": 0.6027162258756255, + "grad_norm": 0.2957002520561218, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0339, + "step": 21080 + }, + { + "epoch": 0.6030021443888491, + "grad_norm": 0.6870969533920288, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0313, + "step": 21090 + }, + { + "epoch": 0.6032880629020729, + "grad_norm": 0.530910313129425, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0377, + "step": 21100 + }, + { + "epoch": 0.6035739814152966, + "grad_norm": 0.21370625495910645, + "learning_rate": 5.86170998451151e-06, + "loss": 0.032, + "step": 21110 + }, + { + "epoch": 0.6038598999285204, + "grad_norm": 0.6039503812789917, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0258, + "step": 21120 + }, + { + "epoch": 0.6041458184417441, + "grad_norm": 0.5375682711601257, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0341, + "step": 21130 + }, + { + "epoch": 0.6044317369549679, + "grad_norm": 0.4819096326828003, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0309, + "step": 21140 + }, + { + "epoch": 0.6047176554681916, + "grad_norm": 0.31165415048599243, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0278, + "step": 21150 + }, + { + "epoch": 0.6050035739814152, + "grad_norm": 0.2781001925468445, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0342, + "step": 21160 + }, + { + "epoch": 0.605289492494639, + "grad_norm": 0.44726037979125977, + "learning_rate": 5.813791207086085e-06, + "loss": 0.032, + "step": 21170 + }, + { + "epoch": 0.6055754110078627, + "grad_norm": 0.5762766599655151, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0325, + "step": 21180 + }, + { + "epoch": 0.6058613295210865, + "grad_norm": 0.49829939007759094, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0322, + "step": 21190 + }, + { + "epoch": 0.6061472480343102, + "grad_norm": 0.4683297276496887, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0369, + "step": 21200 + }, + { + "epoch": 0.606433166547534, + "grad_norm": 0.662159264087677, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0278, + "step": 21210 + }, + { + "epoch": 0.6067190850607577, + "grad_norm": 0.4397001564502716, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0366, + "step": 21220 + }, + { + "epoch": 0.6070050035739815, + "grad_norm": 0.4977007508277893, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0293, + "step": 21230 + }, + { + "epoch": 0.6072909220872051, + "grad_norm": 0.3705490827560425, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0315, + "step": 21240 + }, + { + "epoch": 0.6075768406004288, + "grad_norm": 0.6350240111351013, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0286, + "step": 21250 + }, + { + "epoch": 0.6078627591136526, + "grad_norm": 0.5590423941612244, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0373, + "step": 21260 + }, + { + "epoch": 0.6081486776268763, + "grad_norm": 0.5244049429893494, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0325, + "step": 21270 + }, + { + "epoch": 0.6084345961401001, + "grad_norm": 1.082044005393982, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0373, + "step": 21280 + }, + { + "epoch": 0.6087205146533238, + "grad_norm": 0.614028811454773, + "learning_rate": 5.71861298612245e-06, + "loss": 0.031, + "step": 21290 + }, + { + "epoch": 0.6090064331665476, + "grad_norm": 0.783205509185791, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0289, + "step": 21300 + }, + { + "epoch": 0.6092923516797712, + "grad_norm": 0.5420807600021362, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.031, + "step": 21310 + }, + { + "epoch": 0.609578270192995, + "grad_norm": 0.42979222536087036, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0291, + "step": 21320 + }, + { + "epoch": 0.6098641887062187, + "grad_norm": 0.44511356949806213, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.031, + "step": 21330 + }, + { + "epoch": 0.6101501072194424, + "grad_norm": 0.528799831867218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0269, + "step": 21340 + }, + { + "epoch": 0.6104360257326662, + "grad_norm": 0.43274471163749695, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0438, + "step": 21350 + }, + { + "epoch": 0.6107219442458899, + "grad_norm": 0.8020172715187073, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0393, + "step": 21360 + }, + { + "epoch": 0.6110078627591137, + "grad_norm": 0.4354296028614044, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0338, + "step": 21370 + }, + { + "epoch": 0.6112937812723374, + "grad_norm": 0.587364673614502, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0359, + "step": 21380 + }, + { + "epoch": 0.6115796997855611, + "grad_norm": 0.5426310300827026, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0333, + "step": 21390 + }, + { + "epoch": 0.6118656182987848, + "grad_norm": 0.5900459289550781, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0344, + "step": 21400 + }, + { + "epoch": 0.6121515368120086, + "grad_norm": 0.5652357935905457, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0396, + "step": 21410 + }, + { + "epoch": 0.6124374553252323, + "grad_norm": 0.5287114977836609, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0387, + "step": 21420 + }, + { + "epoch": 0.612723373838456, + "grad_norm": 0.7939184904098511, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0351, + "step": 21430 + }, + { + "epoch": 0.6130092923516798, + "grad_norm": 0.6840642094612122, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0349, + "step": 21440 + }, + { + "epoch": 0.6132952108649035, + "grad_norm": 0.3717428147792816, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0336, + "step": 21450 + }, + { + "epoch": 0.6135811293781273, + "grad_norm": 0.5073713064193726, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0326, + "step": 21460 + }, + { + "epoch": 0.6138670478913509, + "grad_norm": 1.1579232215881348, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0388, + "step": 21470 + }, + { + "epoch": 0.6141529664045747, + "grad_norm": 0.4209369122982025, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0307, + "step": 21480 + }, + { + "epoch": 0.6144388849177984, + "grad_norm": 0.38663822412490845, + "learning_rate": 5.561973825289734e-06, + "loss": 0.037, + "step": 21490 + }, + { + "epoch": 0.6147248034310222, + "grad_norm": 0.538270890712738, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0333, + "step": 21500 + }, + { + "epoch": 0.6150107219442459, + "grad_norm": 0.28280535340309143, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0319, + "step": 21510 + }, + { + "epoch": 0.6152966404574696, + "grad_norm": 0.5407803058624268, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0352, + "step": 21520 + }, + { + "epoch": 0.6155825589706934, + "grad_norm": 1.4600974321365356, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0409, + "step": 21530 + }, + { + "epoch": 0.615868477483917, + "grad_norm": 0.659900426864624, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0322, + "step": 21540 + }, + { + "epoch": 0.6161543959971408, + "grad_norm": 0.6401934623718262, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0339, + "step": 21550 + }, + { + "epoch": 0.6164403145103645, + "grad_norm": 0.6409866213798523, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0331, + "step": 21560 + }, + { + "epoch": 0.6167262330235883, + "grad_norm": 0.6627630591392517, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0332, + "step": 21570 + }, + { + "epoch": 0.617012151536812, + "grad_norm": 0.6180721521377563, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0327, + "step": 21580 + }, + { + "epoch": 0.6172980700500358, + "grad_norm": 0.4689866006374359, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0276, + "step": 21590 + }, + { + "epoch": 0.6175839885632595, + "grad_norm": 0.5039265751838684, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0427, + "step": 21600 + }, + { + "epoch": 0.6178699070764831, + "grad_norm": 0.5313833355903625, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0351, + "step": 21610 + }, + { + "epoch": 0.6181558255897069, + "grad_norm": 0.4919044077396393, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0327, + "step": 21620 + }, + { + "epoch": 0.6184417441029306, + "grad_norm": 0.5446444153785706, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0331, + "step": 21630 + }, + { + "epoch": 0.6187276626161544, + "grad_norm": 0.5198109745979309, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.032, + "step": 21640 + }, + { + "epoch": 0.6190135811293781, + "grad_norm": 0.5684625506401062, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0339, + "step": 21650 + }, + { + "epoch": 0.6192994996426019, + "grad_norm": 0.6882810592651367, + "learning_rate": 5.430834687545416e-06, + "loss": 0.035, + "step": 21660 + }, + { + "epoch": 0.6195854181558256, + "grad_norm": 0.7360101938247681, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0306, + "step": 21670 + }, + { + "epoch": 0.6198713366690494, + "grad_norm": 0.5557180047035217, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0241, + "step": 21680 + }, + { + "epoch": 0.620157255182273, + "grad_norm": 0.4302096962928772, + "learning_rate": 5.407887295494495e-06, + "loss": 0.035, + "step": 21690 + }, + { + "epoch": 0.6204431736954967, + "grad_norm": 0.4740016460418701, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0331, + "step": 21700 + }, + { + "epoch": 0.6207290922087205, + "grad_norm": 0.5400598049163818, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0297, + "step": 21710 + }, + { + "epoch": 0.6210150107219442, + "grad_norm": 0.4270641803741455, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0334, + "step": 21720 + }, + { + "epoch": 0.621300929235168, + "grad_norm": 0.41063550114631653, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0375, + "step": 21730 + }, + { + "epoch": 0.6215868477483917, + "grad_norm": 0.48556044697761536, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0291, + "step": 21740 + }, + { + "epoch": 0.6218727662616155, + "grad_norm": 0.2872731387615204, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0323, + "step": 21750 + }, + { + "epoch": 0.6221586847748392, + "grad_norm": 0.4088454246520996, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0307, + "step": 21760 + }, + { + "epoch": 0.622444603288063, + "grad_norm": 0.42600440979003906, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0326, + "step": 21770 + }, + { + "epoch": 0.6227305218012866, + "grad_norm": 0.36466315388679504, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0337, + "step": 21780 + }, + { + "epoch": 0.6230164403145103, + "grad_norm": 0.588921308517456, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0336, + "step": 21790 + }, + { + "epoch": 0.6233023588277341, + "grad_norm": 0.44768571853637695, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0326, + "step": 21800 + }, + { + "epoch": 0.6235882773409578, + "grad_norm": 1.1612637042999268, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0355, + "step": 21810 + }, + { + "epoch": 0.6238741958541816, + "grad_norm": 1.0912114381790161, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0436, + "step": 21820 + }, + { + "epoch": 0.6241601143674053, + "grad_norm": 0.5813164710998535, + "learning_rate": 5.301584321328435e-06, + "loss": 0.034, + "step": 21830 + }, + { + "epoch": 0.624446032880629, + "grad_norm": 0.45064911246299744, + "learning_rate": 5.294041118587667e-06, + "loss": 0.032, + "step": 21840 + }, + { + "epoch": 0.6247319513938527, + "grad_norm": 0.5173943638801575, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0322, + "step": 21850 + }, + { + "epoch": 0.6250178699070765, + "grad_norm": 0.41157352924346924, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0319, + "step": 21860 + }, + { + "epoch": 0.6253037884203002, + "grad_norm": 0.5711286067962646, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0288, + "step": 21870 + }, + { + "epoch": 0.6255897069335239, + "grad_norm": 0.5108116865158081, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0325, + "step": 21880 + }, + { + "epoch": 0.6258756254467477, + "grad_norm": 0.49562424421310425, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0306, + "step": 21890 + }, + { + "epoch": 0.6261615439599714, + "grad_norm": 0.3392108976840973, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0358, + "step": 21900 + }, + { + "epoch": 0.6264474624731952, + "grad_norm": 1.0588114261627197, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0404, + "step": 21910 + }, + { + "epoch": 0.6267333809864188, + "grad_norm": 0.6979959607124329, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0349, + "step": 21920 + }, + { + "epoch": 0.6270192994996426, + "grad_norm": 0.3185918927192688, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0276, + "step": 21930 + }, + { + "epoch": 0.6273052180128663, + "grad_norm": 0.3921501338481903, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0326, + "step": 21940 + }, + { + "epoch": 0.6275911365260901, + "grad_norm": 0.9666212797164917, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0346, + "step": 21950 + }, + { + "epoch": 0.6278770550393138, + "grad_norm": 0.4483211040496826, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0306, + "step": 21960 + }, + { + "epoch": 0.6281629735525375, + "grad_norm": 0.4839077293872833, + "learning_rate": 5.196592054173714e-06, + "loss": 0.026, + "step": 21970 + }, + { + "epoch": 0.6284488920657613, + "grad_norm": 0.5054528117179871, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0299, + "step": 21980 + }, + { + "epoch": 0.628734810578985, + "grad_norm": 0.5953076481819153, + "learning_rate": 5.181701567303612e-06, + "loss": 0.036, + "step": 21990 + }, + { + "epoch": 0.6290207290922087, + "grad_norm": 0.39300060272216797, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0358, + "step": 22000 + }, + { + "epoch": 0.6293066476054324, + "grad_norm": 0.42864665389060974, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0363, + "step": 22010 + }, + { + "epoch": 0.6295925661186562, + "grad_norm": 0.33609238266944885, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0398, + "step": 22020 + }, + { + "epoch": 0.6298784846318799, + "grad_norm": 0.4237107038497925, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0319, + "step": 22030 + }, + { + "epoch": 0.6301644031451037, + "grad_norm": 0.42774054408073425, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0332, + "step": 22040 + }, + { + "epoch": 0.6304503216583274, + "grad_norm": 0.8992825150489807, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0396, + "step": 22050 + }, + { + "epoch": 0.630736240171551, + "grad_norm": 0.20832861959934235, + "learning_rate": 5.129800405815733e-06, + "loss": 0.03, + "step": 22060 + }, + { + "epoch": 0.6310221586847748, + "grad_norm": 0.5961321592330933, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0429, + "step": 22070 + }, + { + "epoch": 0.6313080771979985, + "grad_norm": 0.5037736296653748, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0332, + "step": 22080 + }, + { + "epoch": 0.6315939957112223, + "grad_norm": 0.383732408285141, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0293, + "step": 22090 + }, + { + "epoch": 0.631879914224446, + "grad_norm": 0.8124368786811829, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0286, + "step": 22100 + }, + { + "epoch": 0.6321658327376698, + "grad_norm": 0.96833735704422, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0346, + "step": 22110 + }, + { + "epoch": 0.6324517512508935, + "grad_norm": 0.42382001876831055, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0345, + "step": 22120 + }, + { + "epoch": 0.6327376697641173, + "grad_norm": 0.5928776860237122, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0314, + "step": 22130 + }, + { + "epoch": 0.633023588277341, + "grad_norm": 0.7822670340538025, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0335, + "step": 22140 + }, + { + "epoch": 0.6333095067905646, + "grad_norm": 0.6383520364761353, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0321, + "step": 22150 + }, + { + "epoch": 0.6335954253037884, + "grad_norm": 0.3413240611553192, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0314, + "step": 22160 + }, + { + "epoch": 0.6338813438170121, + "grad_norm": 0.5960783958435059, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0385, + "step": 22170 + }, + { + "epoch": 0.6341672623302359, + "grad_norm": 0.2557702660560608, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0316, + "step": 22180 + }, + { + "epoch": 0.6344531808434596, + "grad_norm": 0.6229982376098633, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0325, + "step": 22190 + }, + { + "epoch": 0.6347390993566834, + "grad_norm": 0.5080077052116394, + "learning_rate": 5.027013727107874e-06, + "loss": 0.036, + "step": 22200 + }, + { + "epoch": 0.6350250178699071, + "grad_norm": 0.5630851984024048, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0279, + "step": 22210 + }, + { + "epoch": 0.6353109363831309, + "grad_norm": 0.81584233045578, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0343, + "step": 22220 + }, + { + "epoch": 0.6355968548963545, + "grad_norm": 0.3985321521759033, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0289, + "step": 22230 + }, + { + "epoch": 0.6358827734095782, + "grad_norm": 0.4481184482574463, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0345, + "step": 22240 + }, + { + "epoch": 0.636168691922802, + "grad_norm": 0.3640075623989105, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0307, + "step": 22250 + }, + { + "epoch": 0.6364546104360257, + "grad_norm": 0.4006771147251129, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0381, + "step": 22260 + }, + { + "epoch": 0.6367405289492495, + "grad_norm": 0.7638134360313416, + "learning_rate": 4.976134120528886e-06, + "loss": 0.039, + "step": 22270 + }, + { + "epoch": 0.6370264474624732, + "grad_norm": 0.4820837080478668, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0281, + "step": 22280 + }, + { + "epoch": 0.637312365975697, + "grad_norm": 0.5928444266319275, + "learning_rate": 4.961660586405147e-06, + "loss": 0.033, + "step": 22290 + }, + { + "epoch": 0.6375982844889206, + "grad_norm": 0.50687575340271, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0357, + "step": 22300 + }, + { + "epoch": 0.6378842030021444, + "grad_norm": 0.673939049243927, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0301, + "step": 22310 + }, + { + "epoch": 0.6381701215153681, + "grad_norm": 0.4300031065940857, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.029, + "step": 22320 + }, + { + "epoch": 0.6384560400285918, + "grad_norm": 0.6585102677345276, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0333, + "step": 22330 + }, + { + "epoch": 0.6387419585418156, + "grad_norm": 0.6430448889732361, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0345, + "step": 22340 + }, + { + "epoch": 0.6390278770550393, + "grad_norm": 0.8272712826728821, + "learning_rate": 4.918410326949594e-06, + "loss": 0.034, + "step": 22350 + }, + { + "epoch": 0.6393137955682631, + "grad_norm": 0.7631726861000061, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0321, + "step": 22360 + }, + { + "epoch": 0.6395997140814867, + "grad_norm": 0.5562252402305603, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0339, + "step": 22370 + }, + { + "epoch": 0.6398856325947105, + "grad_norm": 0.6027814149856567, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0355, + "step": 22380 + }, + { + "epoch": 0.6401715511079342, + "grad_norm": 0.3548984229564667, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0343, + "step": 22390 + }, + { + "epoch": 0.640457469621158, + "grad_norm": 0.4959709346294403, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.031, + "step": 22400 + }, + { + "epoch": 0.6407433881343817, + "grad_norm": 0.3765028715133667, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.0406, + "step": 22410 + }, + { + "epoch": 0.6410293066476054, + "grad_norm": 0.5014662146568298, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0297, + "step": 22420 + }, + { + "epoch": 0.6413152251608292, + "grad_norm": 0.5085675716400146, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0324, + "step": 22430 + }, + { + "epoch": 0.6416011436740529, + "grad_norm": 0.37595826387405396, + "learning_rate": 4.854017257346105e-06, + "loss": 0.033, + "step": 22440 + }, + { + "epoch": 0.6418870621872766, + "grad_norm": 0.5408678650856018, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0323, + "step": 22450 + }, + { + "epoch": 0.6421729807005003, + "grad_norm": 0.4319652020931244, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0269, + "step": 22460 + }, + { + "epoch": 0.6424588992137241, + "grad_norm": 0.41388124227523804, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0311, + "step": 22470 + }, + { + "epoch": 0.6427448177269478, + "grad_norm": 0.4778555631637573, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0332, + "step": 22480 + }, + { + "epoch": 0.6430307362401716, + "grad_norm": 0.38835474848747253, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0304, + "step": 22490 + }, + { + "epoch": 0.6433166547533953, + "grad_norm": 0.5165611505508423, + "learning_rate": 4.81141273556404e-06, + "loss": 0.0344, + "step": 22500 + }, + { + "epoch": 0.643602573266619, + "grad_norm": 0.4285198450088501, + "learning_rate": 4.804337352679613e-06, + "loss": 0.035, + "step": 22510 + }, + { + "epoch": 0.6438884917798428, + "grad_norm": 0.4512922167778015, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0354, + "step": 22520 + }, + { + "epoch": 0.6441744102930664, + "grad_norm": 0.33437663316726685, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0343, + "step": 22530 + }, + { + "epoch": 0.6444603288062902, + "grad_norm": 0.45291104912757874, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0307, + "step": 22540 + }, + { + "epoch": 0.6447462473195139, + "grad_norm": 0.5920093655586243, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0338, + "step": 22550 + }, + { + "epoch": 0.6450321658327377, + "grad_norm": 0.6362392902374268, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0335, + "step": 22560 + }, + { + "epoch": 0.6453180843459614, + "grad_norm": 0.28033652901649475, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0267, + "step": 22570 + }, + { + "epoch": 0.6456040028591852, + "grad_norm": 0.4563148617744446, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0316, + "step": 22580 + }, + { + "epoch": 0.6458899213724089, + "grad_norm": 0.4889507591724396, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.034, + "step": 22590 + }, + { + "epoch": 0.6461758398856325, + "grad_norm": 0.6826061010360718, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0316, + "step": 22600 + }, + { + "epoch": 0.6464617583988563, + "grad_norm": 0.45066431164741516, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0287, + "step": 22610 + }, + { + "epoch": 0.64674767691208, + "grad_norm": 0.41994187235832214, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0312, + "step": 22620 + }, + { + "epoch": 0.6470335954253038, + "grad_norm": 0.39731675386428833, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0347, + "step": 22630 + }, + { + "epoch": 0.6473195139385275, + "grad_norm": 0.5207498073577881, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0304, + "step": 22640 + }, + { + "epoch": 0.6476054324517513, + "grad_norm": 0.42930668592453003, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0344, + "step": 22650 + }, + { + "epoch": 0.647891350964975, + "grad_norm": 0.3023674488067627, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0359, + "step": 22660 + }, + { + "epoch": 0.6481772694781988, + "grad_norm": 0.43205010890960693, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0323, + "step": 22670 + }, + { + "epoch": 0.6484631879914224, + "grad_norm": 0.5984707474708557, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0318, + "step": 22680 + }, + { + "epoch": 0.6487491065046461, + "grad_norm": 0.43477800488471985, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0346, + "step": 22690 + }, + { + "epoch": 0.6490350250178699, + "grad_norm": 0.3570900857448578, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0342, + "step": 22700 + }, + { + "epoch": 0.6493209435310936, + "grad_norm": 0.47367945313453674, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0367, + "step": 22710 + }, + { + "epoch": 0.6496068620443174, + "grad_norm": 0.3768099844455719, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0357, + "step": 22720 + }, + { + "epoch": 0.6498927805575411, + "grad_norm": 0.6188724040985107, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0299, + "step": 22730 + }, + { + "epoch": 0.6501786990707649, + "grad_norm": 0.5733038783073425, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0343, + "step": 22740 + }, + { + "epoch": 0.6504646175839885, + "grad_norm": 0.5000156164169312, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0284, + "step": 22750 + }, + { + "epoch": 0.6507505360972123, + "grad_norm": 0.22813546657562256, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0288, + "step": 22760 + }, + { + "epoch": 0.651036454610436, + "grad_norm": 0.4805088937282562, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0305, + "step": 22770 + }, + { + "epoch": 0.6513223731236597, + "grad_norm": 0.4652612507343292, + "learning_rate": 4.616077433849538e-06, + "loss": 0.0304, + "step": 22780 + }, + { + "epoch": 0.6516082916368835, + "grad_norm": 0.5010579824447632, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0337, + "step": 22790 + }, + { + "epoch": 0.6518942101501072, + "grad_norm": 0.36260518431663513, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0284, + "step": 22800 + }, + { + "epoch": 0.652180128663331, + "grad_norm": 0.45098820328712463, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0297, + "step": 22810 + }, + { + "epoch": 0.6524660471765547, + "grad_norm": 0.6154504418373108, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0366, + "step": 22820 + }, + { + "epoch": 0.6527519656897784, + "grad_norm": 0.4522152543067932, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.033, + "step": 22830 + }, + { + "epoch": 0.6530378842030021, + "grad_norm": 0.34195253252983093, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0304, + "step": 22840 + }, + { + "epoch": 0.6533238027162259, + "grad_norm": 0.49787941575050354, + "learning_rate": 4.568154392147005e-06, + "loss": 0.033, + "step": 22850 + }, + { + "epoch": 0.6536097212294496, + "grad_norm": 0.5249335765838623, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0381, + "step": 22860 + }, + { + "epoch": 0.6538956397426733, + "grad_norm": 0.7645581960678101, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0298, + "step": 22870 + }, + { + "epoch": 0.6541815582558971, + "grad_norm": 0.6034232974052429, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0313, + "step": 22880 + }, + { + "epoch": 0.6544674767691208, + "grad_norm": 0.3499184846878052, + "learning_rate": 4.54093567906903e-06, + "loss": 0.036, + "step": 22890 + }, + { + "epoch": 0.6547533952823446, + "grad_norm": 0.4157135486602783, + "learning_rate": 4.534149931036931e-06, + "loss": 0.033, + "step": 22900 + }, + { + "epoch": 0.6550393137955682, + "grad_norm": 0.4563712775707245, + "learning_rate": 4.527371771040039e-06, + "loss": 0.0361, + "step": 22910 + }, + { + "epoch": 0.655325232308792, + "grad_norm": 1.080802321434021, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0307, + "step": 22920 + }, + { + "epoch": 0.6556111508220157, + "grad_norm": 0.38259357213974, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0292, + "step": 22930 + }, + { + "epoch": 0.6558970693352395, + "grad_norm": 0.6920587420463562, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0322, + "step": 22940 + }, + { + "epoch": 0.6561829878484632, + "grad_norm": 0.628978967666626, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0391, + "step": 22950 + }, + { + "epoch": 0.6564689063616869, + "grad_norm": 0.4848436713218689, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0306, + "step": 22960 + }, + { + "epoch": 0.6567548248749107, + "grad_norm": 0.4478876292705536, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0334, + "step": 22970 + }, + { + "epoch": 0.6570407433881343, + "grad_norm": 0.47360673546791077, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0357, + "step": 22980 + }, + { + "epoch": 0.6573266619013581, + "grad_norm": 0.32840496301651, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0339, + "step": 22990 + }, + { + "epoch": 0.6576125804145818, + "grad_norm": 0.4047236442565918, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0321, + "step": 23000 + }, + { + "epoch": 0.6578984989278056, + "grad_norm": 0.7817053198814392, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0375, + "step": 23010 + }, + { + "epoch": 0.6581844174410293, + "grad_norm": 0.38985809683799744, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0343, + "step": 23020 + }, + { + "epoch": 0.6584703359542531, + "grad_norm": 0.45360830426216125, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0287, + "step": 23030 + }, + { + "epoch": 0.6587562544674768, + "grad_norm": 0.2886345088481903, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0322, + "step": 23040 + }, + { + "epoch": 0.6590421729807004, + "grad_norm": 0.8546258211135864, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0331, + "step": 23050 + }, + { + "epoch": 0.6593280914939242, + "grad_norm": 0.48426172137260437, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0343, + "step": 23060 + }, + { + "epoch": 0.6596140100071479, + "grad_norm": 0.46379074454307556, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0296, + "step": 23070 + }, + { + "epoch": 0.6598999285203717, + "grad_norm": 0.7772185206413269, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0319, + "step": 23080 + }, + { + "epoch": 0.6601858470335954, + "grad_norm": 0.4606277644634247, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0336, + "step": 23090 + }, + { + "epoch": 0.6604717655468192, + "grad_norm": 0.43342530727386475, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0287, + "step": 23100 + }, + { + "epoch": 0.6607576840600429, + "grad_norm": 0.385151207447052, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0363, + "step": 23110 + }, + { + "epoch": 0.6610436025732667, + "grad_norm": 0.3960207998752594, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0306, + "step": 23120 + }, + { + "epoch": 0.6613295210864903, + "grad_norm": 0.41210439801216125, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0348, + "step": 23130 + }, + { + "epoch": 0.661615439599714, + "grad_norm": 0.41976168751716614, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0272, + "step": 23140 + }, + { + "epoch": 0.6619013581129378, + "grad_norm": 0.3195948004722595, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0362, + "step": 23150 + }, + { + "epoch": 0.6621872766261615, + "grad_norm": 0.7024016380310059, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0316, + "step": 23160 + }, + { + "epoch": 0.6624731951393853, + "grad_norm": 0.2894183099269867, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0339, + "step": 23170 + }, + { + "epoch": 0.662759113652609, + "grad_norm": 0.489715576171875, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0272, + "step": 23180 + }, + { + "epoch": 0.6630450321658328, + "grad_norm": 0.3406641185283661, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0272, + "step": 23190 + }, + { + "epoch": 0.6633309506790565, + "grad_norm": 0.3647848963737488, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0337, + "step": 23200 + }, + { + "epoch": 0.6636168691922802, + "grad_norm": 0.7023333311080933, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0334, + "step": 23210 + }, + { + "epoch": 0.6639027877055039, + "grad_norm": 0.43989211320877075, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0313, + "step": 23220 + }, + { + "epoch": 0.6641887062187276, + "grad_norm": 0.7329099774360657, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0283, + "step": 23230 + }, + { + "epoch": 0.6644746247319514, + "grad_norm": 0.3954019546508789, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0321, + "step": 23240 + }, + { + "epoch": 0.6647605432451751, + "grad_norm": 0.38020703196525574, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0337, + "step": 23250 + }, + { + "epoch": 0.6650464617583989, + "grad_norm": 0.5988985300064087, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0353, + "step": 23260 + }, + { + "epoch": 0.6653323802716226, + "grad_norm": 0.4259869158267975, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0316, + "step": 23270 + }, + { + "epoch": 0.6656182987848464, + "grad_norm": 0.4322545528411865, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0318, + "step": 23280 + }, + { + "epoch": 0.66590421729807, + "grad_norm": 0.40275540947914124, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0344, + "step": 23290 + }, + { + "epoch": 0.6661901358112938, + "grad_norm": 0.5070827603340149, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0336, + "step": 23300 + }, + { + "epoch": 0.6664760543245175, + "grad_norm": 0.614973247051239, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0352, + "step": 23310 + }, + { + "epoch": 0.6667619728377412, + "grad_norm": 0.4637722074985504, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0277, + "step": 23320 + }, + { + "epoch": 0.667047891350965, + "grad_norm": 0.34951677918434143, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0284, + "step": 23330 + }, + { + "epoch": 0.6673338098641887, + "grad_norm": 0.5609407424926758, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0304, + "step": 23340 + }, + { + "epoch": 0.6676197283774125, + "grad_norm": 0.44585973024368286, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0263, + "step": 23350 + }, + { + "epoch": 0.6679056468906361, + "grad_norm": 0.5311269760131836, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0311, + "step": 23360 + }, + { + "epoch": 0.6681915654038599, + "grad_norm": 0.4923100471496582, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0277, + "step": 23370 + }, + { + "epoch": 0.6684774839170836, + "grad_norm": 0.5254819989204407, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0328, + "step": 23380 + }, + { + "epoch": 0.6687634024303074, + "grad_norm": 0.47537869215011597, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0373, + "step": 23390 + }, + { + "epoch": 0.6690493209435311, + "grad_norm": 0.40087464451789856, + "learning_rate": 4.204700678381975e-06, + "loss": 0.034, + "step": 23400 + }, + { + "epoch": 0.6693352394567548, + "grad_norm": 0.5166190266609192, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0314, + "step": 23410 + }, + { + "epoch": 0.6696211579699786, + "grad_norm": 0.42874693870544434, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0279, + "step": 23420 + }, + { + "epoch": 0.6699070764832022, + "grad_norm": 0.3685651123523712, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0313, + "step": 23430 + }, + { + "epoch": 0.670192994996426, + "grad_norm": 0.5417486429214478, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.033, + "step": 23440 + }, + { + "epoch": 0.6704789135096497, + "grad_norm": 0.5764726996421814, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0368, + "step": 23450 + }, + { + "epoch": 0.6707648320228735, + "grad_norm": 0.44168850779533386, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0258, + "step": 23460 + }, + { + "epoch": 0.6710507505360972, + "grad_norm": 0.39990919828414917, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0403, + "step": 23470 + }, + { + "epoch": 0.671336669049321, + "grad_norm": 0.7526253461837769, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0334, + "step": 23480 + }, + { + "epoch": 0.6716225875625447, + "grad_norm": 0.4888451397418976, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0314, + "step": 23490 + }, + { + "epoch": 0.6719085060757684, + "grad_norm": 0.5732892751693726, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0277, + "step": 23500 + }, + { + "epoch": 0.6721944245889921, + "grad_norm": 0.5806633830070496, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0347, + "step": 23510 + }, + { + "epoch": 0.6724803431022158, + "grad_norm": 0.4336501657962799, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0369, + "step": 23520 + }, + { + "epoch": 0.6727662616154396, + "grad_norm": 0.47082582116127014, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0408, + "step": 23530 + }, + { + "epoch": 0.6730521801286633, + "grad_norm": 0.6571422815322876, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0316, + "step": 23540 + }, + { + "epoch": 0.6733380986418871, + "grad_norm": 0.4899539649486542, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0256, + "step": 23550 + }, + { + "epoch": 0.6736240171551108, + "grad_norm": 0.3201868236064911, + "learning_rate": 4.103441847743051e-06, + "loss": 0.029, + "step": 23560 + }, + { + "epoch": 0.6739099356683346, + "grad_norm": 0.4385588765144348, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0284, + "step": 23570 + }, + { + "epoch": 0.6741958541815583, + "grad_norm": 0.5079174637794495, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0298, + "step": 23580 + }, + { + "epoch": 0.6744817726947819, + "grad_norm": 0.609523355960846, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0313, + "step": 23590 + }, + { + "epoch": 0.6747676912080057, + "grad_norm": 0.487690269947052, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0246, + "step": 23600 + }, + { + "epoch": 0.6750536097212294, + "grad_norm": 0.5146880745887756, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0319, + "step": 23610 + }, + { + "epoch": 0.6753395282344532, + "grad_norm": 0.5848239064216614, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0282, + "step": 23620 + }, + { + "epoch": 0.6756254467476769, + "grad_norm": 0.7779616117477417, + "learning_rate": 4.05979084812184e-06, + "loss": 0.033, + "step": 23630 + }, + { + "epoch": 0.6759113652609007, + "grad_norm": 0.3329331576824188, + "learning_rate": 4.053587511509546e-06, + "loss": 0.028, + "step": 23640 + }, + { + "epoch": 0.6761972837741244, + "grad_norm": 0.4691336154937744, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0313, + "step": 23650 + }, + { + "epoch": 0.6764832022873482, + "grad_norm": 0.47258421778678894, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0326, + "step": 23660 + }, + { + "epoch": 0.6767691208005718, + "grad_norm": 0.5333718657493591, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0332, + "step": 23670 + }, + { + "epoch": 0.6770550393137955, + "grad_norm": 0.7278451323509216, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0409, + "step": 23680 + }, + { + "epoch": 0.6773409578270193, + "grad_norm": 0.41567277908325195, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0263, + "step": 23690 + }, + { + "epoch": 0.677626876340243, + "grad_norm": 0.4351106584072113, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0276, + "step": 23700 + }, + { + "epoch": 0.6779127948534668, + "grad_norm": 0.31096217036247253, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0339, + "step": 23710 + }, + { + "epoch": 0.6781987133666905, + "grad_norm": 0.6321837306022644, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0313, + "step": 23720 + }, + { + "epoch": 0.6784846318799143, + "grad_norm": 0.5278098583221436, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0294, + "step": 23730 + }, + { + "epoch": 0.6787705503931379, + "grad_norm": 0.5778757333755493, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0324, + "step": 23740 + }, + { + "epoch": 0.6790564689063617, + "grad_norm": 0.6164223551750183, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0316, + "step": 23750 + }, + { + "epoch": 0.6793423874195854, + "grad_norm": 0.2872319221496582, + "learning_rate": 3.979785400791052e-06, + "loss": 0.034, + "step": 23760 + }, + { + "epoch": 0.6796283059328091, + "grad_norm": 0.6088704466819763, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0317, + "step": 23770 + }, + { + "epoch": 0.6799142244460329, + "grad_norm": 0.4733040928840637, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0282, + "step": 23780 + }, + { + "epoch": 0.6802001429592566, + "grad_norm": 1.3417131900787354, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0304, + "step": 23790 + }, + { + "epoch": 0.6804860614724804, + "grad_norm": 0.7316146492958069, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0311, + "step": 23800 + }, + { + "epoch": 0.680771979985704, + "grad_norm": 0.5726248025894165, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0323, + "step": 23810 + }, + { + "epoch": 0.6810578984989278, + "grad_norm": 0.3990941345691681, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0277, + "step": 23820 + }, + { + "epoch": 0.6813438170121515, + "grad_norm": 0.49237731099128723, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0287, + "step": 23830 + }, + { + "epoch": 0.6816297355253753, + "grad_norm": 0.47560542821884155, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0298, + "step": 23840 + }, + { + "epoch": 0.681915654038599, + "grad_norm": 0.5967867374420166, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0295, + "step": 23850 + }, + { + "epoch": 0.6822015725518227, + "grad_norm": 0.5726722478866577, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0283, + "step": 23860 + }, + { + "epoch": 0.6824874910650465, + "grad_norm": 0.282678484916687, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0303, + "step": 23870 + }, + { + "epoch": 0.6827734095782702, + "grad_norm": 0.4432118237018585, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0296, + "step": 23880 + }, + { + "epoch": 0.683059328091494, + "grad_norm": 0.33677008748054504, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0379, + "step": 23890 + }, + { + "epoch": 0.6833452466047176, + "grad_norm": 0.5063587427139282, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0281, + "step": 23900 + }, + { + "epoch": 0.6836311651179414, + "grad_norm": 0.2592383921146393, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0263, + "step": 23910 + }, + { + "epoch": 0.6839170836311651, + "grad_norm": 0.4482796788215637, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0289, + "step": 23920 + }, + { + "epoch": 0.6842030021443889, + "grad_norm": 0.2609167993068695, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0294, + "step": 23930 + }, + { + "epoch": 0.6844889206576126, + "grad_norm": 0.36982619762420654, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0306, + "step": 23940 + }, + { + "epoch": 0.6847748391708363, + "grad_norm": 0.47758495807647705, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0273, + "step": 23950 + }, + { + "epoch": 0.68506075768406, + "grad_norm": 0.5566948652267456, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0318, + "step": 23960 + }, + { + "epoch": 0.6853466761972837, + "grad_norm": 0.7815461754798889, + "learning_rate": 3.853493736024934e-06, + "loss": 0.03, + "step": 23970 + }, + { + "epoch": 0.6856325947105075, + "grad_norm": 0.42888402938842773, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0384, + "step": 23980 + }, + { + "epoch": 0.6859185132237312, + "grad_norm": 0.47878748178482056, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0356, + "step": 23990 + }, + { + "epoch": 0.686204431736955, + "grad_norm": 0.3847522735595703, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0272, + "step": 24000 + }, + { + "epoch": 0.6864903502501787, + "grad_norm": 0.7005330920219421, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0314, + "step": 24010 + }, + { + "epoch": 0.6867762687634025, + "grad_norm": 0.7769733667373657, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0306, + "step": 24020 + }, + { + "epoch": 0.6870621872766262, + "grad_norm": 0.4073965847492218, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0286, + "step": 24030 + }, + { + "epoch": 0.6873481057898498, + "grad_norm": 0.6220553517341614, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0358, + "step": 24040 + }, + { + "epoch": 0.6876340243030736, + "grad_norm": 0.32508641481399536, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0284, + "step": 24050 + }, + { + "epoch": 0.6879199428162973, + "grad_norm": 0.4828036427497864, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0308, + "step": 24060 + }, + { + "epoch": 0.6882058613295211, + "grad_norm": 0.4809496998786926, + "learning_rate": 3.794650811106129e-06, + "loss": 0.028, + "step": 24070 + }, + { + "epoch": 0.6884917798427448, + "grad_norm": 0.8497998714447021, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.037, + "step": 24080 + }, + { + "epoch": 0.6887776983559686, + "grad_norm": 0.758666455745697, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0323, + "step": 24090 + }, + { + "epoch": 0.6890636168691923, + "grad_norm": 0.40550050139427185, + "learning_rate": 3.777162510056721e-06, + "loss": 0.0359, + "step": 24100 + }, + { + "epoch": 0.6893495353824161, + "grad_norm": 0.4595869779586792, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0304, + "step": 24110 + }, + { + "epoch": 0.6896354538956397, + "grad_norm": 0.5098794102668762, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0279, + "step": 24120 + }, + { + "epoch": 0.6899213724088634, + "grad_norm": 0.3320889174938202, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0287, + "step": 24130 + }, + { + "epoch": 0.6902072909220872, + "grad_norm": 0.4708438515663147, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0289, + "step": 24140 + }, + { + "epoch": 0.6904932094353109, + "grad_norm": 1.0990219116210938, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0303, + "step": 24150 + }, + { + "epoch": 0.6907791279485347, + "grad_norm": 0.5109107494354248, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0306, + "step": 24160 + }, + { + "epoch": 0.6910650464617584, + "grad_norm": 0.6247434616088867, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0332, + "step": 24170 + }, + { + "epoch": 0.6913509649749822, + "grad_norm": 0.4033079743385315, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0252, + "step": 24180 + }, + { + "epoch": 0.6916368834882058, + "grad_norm": 0.36993420124053955, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0287, + "step": 24190 + }, + { + "epoch": 0.6919228020014296, + "grad_norm": 0.37320762872695923, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0364, + "step": 24200 + }, + { + "epoch": 0.6922087205146533, + "grad_norm": 0.6411201357841492, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0306, + "step": 24210 + }, + { + "epoch": 0.692494639027877, + "grad_norm": 0.7033433318138123, + "learning_rate": 3.707974016467e-06, + "loss": 0.0334, + "step": 24220 + }, + { + "epoch": 0.6927805575411008, + "grad_norm": 0.5307570695877075, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0338, + "step": 24230 + }, + { + "epoch": 0.6930664760543245, + "grad_norm": 0.6726395487785339, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0379, + "step": 24240 + }, + { + "epoch": 0.6933523945675483, + "grad_norm": 0.5609936714172363, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0272, + "step": 24250 + }, + { + "epoch": 0.693638313080772, + "grad_norm": 0.5961005687713623, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0361, + "step": 24260 + }, + { + "epoch": 0.6939242315939957, + "grad_norm": 0.46744176745414734, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0291, + "step": 24270 + }, + { + "epoch": 0.6942101501072194, + "grad_norm": 0.5180732607841492, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0377, + "step": 24280 + }, + { + "epoch": 0.6944960686204432, + "grad_norm": 0.594201922416687, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0312, + "step": 24290 + }, + { + "epoch": 0.6947819871336669, + "grad_norm": 0.5852509140968323, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0303, + "step": 24300 + }, + { + "epoch": 0.6950679056468906, + "grad_norm": 0.7885274291038513, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0329, + "step": 24310 + }, + { + "epoch": 0.6953538241601144, + "grad_norm": 0.5280163884162903, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.031, + "step": 24320 + }, + { + "epoch": 0.6956397426733381, + "grad_norm": 0.6047127842903137, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0283, + "step": 24330 + }, + { + "epoch": 0.6959256611865619, + "grad_norm": 0.43192219734191895, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0338, + "step": 24340 + }, + { + "epoch": 0.6962115796997855, + "grad_norm": 0.3320246636867523, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0262, + "step": 24350 + }, + { + "epoch": 0.6964974982130093, + "grad_norm": 0.46365252137184143, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0342, + "step": 24360 + }, + { + "epoch": 0.696783416726233, + "grad_norm": 0.537933886051178, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0286, + "step": 24370 + }, + { + "epoch": 0.6970693352394568, + "grad_norm": 0.3574221134185791, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0342, + "step": 24380 + }, + { + "epoch": 0.6973552537526805, + "grad_norm": 0.7051029205322266, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0316, + "step": 24390 + }, + { + "epoch": 0.6976411722659042, + "grad_norm": 0.587533712387085, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0302, + "step": 24400 + }, + { + "epoch": 0.697927090779128, + "grad_norm": 0.555778980255127, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0301, + "step": 24410 + }, + { + "epoch": 0.6982130092923516, + "grad_norm": 0.44060736894607544, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0296, + "step": 24420 + }, + { + "epoch": 0.6984989278055754, + "grad_norm": 0.3930843472480774, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0327, + "step": 24430 + }, + { + "epoch": 0.6987848463187991, + "grad_norm": 0.8878913521766663, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0389, + "step": 24440 + }, + { + "epoch": 0.6990707648320229, + "grad_norm": 0.45810988545417786, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0322, + "step": 24450 + }, + { + "epoch": 0.6993566833452466, + "grad_norm": 0.41808775067329407, + "learning_rate": 3.573305344104808e-06, + "loss": 0.032, + "step": 24460 + }, + { + "epoch": 0.6996426018584704, + "grad_norm": 0.5060444474220276, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0317, + "step": 24470 + }, + { + "epoch": 0.6999285203716941, + "grad_norm": 0.28741514682769775, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0271, + "step": 24480 + }, + { + "epoch": 0.7002144388849177, + "grad_norm": 0.5564437508583069, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0279, + "step": 24490 + }, + { + "epoch": 0.7005003573981415, + "grad_norm": 0.43762925267219543, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0317, + "step": 24500 + }, + { + "epoch": 0.7007862759113652, + "grad_norm": 0.46590355038642883, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0314, + "step": 24510 + }, + { + "epoch": 0.701072194424589, + "grad_norm": 0.640477180480957, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0258, + "step": 24520 + }, + { + "epoch": 0.7013581129378127, + "grad_norm": 0.5845742225646973, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0283, + "step": 24530 + }, + { + "epoch": 0.7016440314510365, + "grad_norm": 0.5625128746032715, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0381, + "step": 24540 + }, + { + "epoch": 0.7019299499642602, + "grad_norm": 0.4365232586860657, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0297, + "step": 24550 + }, + { + "epoch": 0.702215868477484, + "grad_norm": 0.5942055583000183, + "learning_rate": 3.518669865884119e-06, + "loss": 0.034, + "step": 24560 + }, + { + "epoch": 0.7025017869907076, + "grad_norm": 0.3847256302833557, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0293, + "step": 24570 + }, + { + "epoch": 0.7027877055039313, + "grad_norm": 0.542539119720459, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0327, + "step": 24580 + }, + { + "epoch": 0.7030736240171551, + "grad_norm": 0.5383610129356384, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0322, + "step": 24590 + }, + { + "epoch": 0.7033595425303788, + "grad_norm": 0.6085273027420044, + "learning_rate": 3.497061149826966e-06, + "loss": 0.0293, + "step": 24600 + }, + { + "epoch": 0.7036454610436026, + "grad_norm": 0.5107666254043579, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0266, + "step": 24610 + }, + { + "epoch": 0.7039313795568263, + "grad_norm": 0.4976873993873596, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0301, + "step": 24620 + }, + { + "epoch": 0.7042172980700501, + "grad_norm": 0.5735257863998413, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0264, + "step": 24630 + }, + { + "epoch": 0.7045032165832738, + "grad_norm": 0.6035013794898987, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0286, + "step": 24640 + }, + { + "epoch": 0.7047891350964975, + "grad_norm": 0.5665635466575623, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0345, + "step": 24650 + }, + { + "epoch": 0.7050750536097212, + "grad_norm": 0.5783578753471375, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0385, + "step": 24660 + }, + { + "epoch": 0.7053609721229449, + "grad_norm": 0.3957138657569885, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.0319, + "step": 24670 + }, + { + "epoch": 0.7056468906361687, + "grad_norm": 0.32982495427131653, + "learning_rate": 3.454266765790622e-06, + "loss": 0.034, + "step": 24680 + }, + { + "epoch": 0.7059328091493924, + "grad_norm": 0.5827629566192627, + "learning_rate": 3.448957251110008e-06, + "loss": 0.029, + "step": 24690 + }, + { + "epoch": 0.7062187276626162, + "grad_norm": 0.28891173005104065, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0328, + "step": 24700 + }, + { + "epoch": 0.7065046461758399, + "grad_norm": 0.7992371320724487, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0333, + "step": 24710 + }, + { + "epoch": 0.7067905646890636, + "grad_norm": 0.5976162552833557, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0327, + "step": 24720 + }, + { + "epoch": 0.7070764832022873, + "grad_norm": 0.4785068929195404, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0334, + "step": 24730 + }, + { + "epoch": 0.7073624017155111, + "grad_norm": 0.6561854481697083, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0317, + "step": 24740 + }, + { + "epoch": 0.7076483202287348, + "grad_norm": 0.6745696067810059, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0289, + "step": 24750 + }, + { + "epoch": 0.7079342387419585, + "grad_norm": 0.4914945960044861, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0306, + "step": 24760 + }, + { + "epoch": 0.7082201572551823, + "grad_norm": 0.35789182782173157, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0327, + "step": 24770 + }, + { + "epoch": 0.708506075768406, + "grad_norm": 0.416161447763443, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0313, + "step": 24780 + }, + { + "epoch": 0.7087919942816298, + "grad_norm": 0.6271718740463257, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0314, + "step": 24790 + }, + { + "epoch": 0.7090779127948534, + "grad_norm": 0.5230259895324707, + "learning_rate": 3.391138816571675e-06, + "loss": 0.037, + "step": 24800 + }, + { + "epoch": 0.7093638313080772, + "grad_norm": 0.54779452085495, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0364, + "step": 24810 + }, + { + "epoch": 0.7096497498213009, + "grad_norm": 0.6326698064804077, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0287, + "step": 24820 + }, + { + "epoch": 0.7099356683345247, + "grad_norm": 0.576437771320343, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0321, + "step": 24830 + }, + { + "epoch": 0.7102215868477484, + "grad_norm": 0.49094530940055847, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0328, + "step": 24840 + }, + { + "epoch": 0.7105075053609721, + "grad_norm": 3.1826400756835938, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0497, + "step": 24850 + }, + { + "epoch": 0.7107934238741959, + "grad_norm": 0.6048339009284973, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0429, + "step": 24860 + }, + { + "epoch": 0.7110793423874195, + "grad_norm": 0.6633393168449402, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0287, + "step": 24870 + }, + { + "epoch": 0.7113652609006433, + "grad_norm": 0.24930168688297272, + "learning_rate": 3.349767211300933e-06, + "loss": 0.027, + "step": 24880 + }, + { + "epoch": 0.711651179413867, + "grad_norm": 0.3934503495693207, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0256, + "step": 24890 + }, + { + "epoch": 0.7119370979270908, + "grad_norm": 0.7811068892478943, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.03, + "step": 24900 + }, + { + "epoch": 0.7122230164403145, + "grad_norm": 0.4274163246154785, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0263, + "step": 24910 + }, + { + "epoch": 0.7125089349535383, + "grad_norm": 0.5188158750534058, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0264, + "step": 24920 + }, + { + "epoch": 0.712794853466762, + "grad_norm": 0.4106016457080841, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0309, + "step": 24930 + }, + { + "epoch": 0.7130807719799857, + "grad_norm": 0.5283434987068176, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0335, + "step": 24940 + }, + { + "epoch": 0.7133666904932094, + "grad_norm": 0.38160789012908936, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0313, + "step": 24950 + }, + { + "epoch": 0.7136526090064331, + "grad_norm": 0.30552029609680176, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0265, + "step": 24960 + }, + { + "epoch": 0.7139385275196569, + "grad_norm": 0.40023618936538696, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0295, + "step": 24970 + }, + { + "epoch": 0.7142244460328806, + "grad_norm": 0.3569220006465912, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0259, + "step": 24980 + }, + { + "epoch": 0.7145103645461044, + "grad_norm": 0.39430442452430725, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0313, + "step": 24990 + }, + { + "epoch": 0.7147962830593281, + "grad_norm": 0.5891808271408081, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0272, + "step": 25000 + }, + { + "epoch": 0.7150822015725519, + "grad_norm": 0.487945556640625, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0308, + "step": 25010 + }, + { + "epoch": 0.7153681200857755, + "grad_norm": 0.551268458366394, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.026, + "step": 25020 + }, + { + "epoch": 0.7156540385989992, + "grad_norm": 0.7384896278381348, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0371, + "step": 25030 + }, + { + "epoch": 0.715939957112223, + "grad_norm": 0.43013718724250793, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0348, + "step": 25040 + }, + { + "epoch": 0.7162258756254467, + "grad_norm": 0.28747591376304626, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0249, + "step": 25050 + }, + { + "epoch": 0.7165117941386705, + "grad_norm": 0.48107975721359253, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0247, + "step": 25060 + }, + { + "epoch": 0.7167977126518942, + "grad_norm": 0.4077073931694031, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0313, + "step": 25070 + }, + { + "epoch": 0.717083631165118, + "grad_norm": 0.7853788137435913, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0286, + "step": 25080 + }, + { + "epoch": 0.7173695496783417, + "grad_norm": 0.6021899580955505, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0316, + "step": 25090 + }, + { + "epoch": 0.7176554681915654, + "grad_norm": 0.5997788906097412, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0286, + "step": 25100 + }, + { + "epoch": 0.7179413867047891, + "grad_norm": 0.47682714462280273, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0329, + "step": 25110 + }, + { + "epoch": 0.7182273052180128, + "grad_norm": 0.6501848697662354, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0289, + "step": 25120 + }, + { + "epoch": 0.7185132237312366, + "grad_norm": 1.000689148902893, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0351, + "step": 25130 + }, + { + "epoch": 0.7187991422444603, + "grad_norm": 0.5946705937385559, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0268, + "step": 25140 + }, + { + "epoch": 0.7190850607576841, + "grad_norm": 0.46967631578445435, + "learning_rate": 3.214397932123149e-06, + "loss": 0.031, + "step": 25150 + }, + { + "epoch": 0.7193709792709078, + "grad_norm": 1.052093744277954, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0302, + "step": 25160 + }, + { + "epoch": 0.7196568977841316, + "grad_norm": 0.9337649941444397, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0304, + "step": 25170 + }, + { + "epoch": 0.7199428162973552, + "grad_norm": 0.423648864030838, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0297, + "step": 25180 + }, + { + "epoch": 0.720228734810579, + "grad_norm": 0.46862924098968506, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.028, + "step": 25190 + }, + { + "epoch": 0.7205146533238027, + "grad_norm": 0.7099304795265198, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0306, + "step": 25200 + }, + { + "epoch": 0.7208005718370264, + "grad_norm": 0.5219885110855103, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0269, + "step": 25210 + }, + { + "epoch": 0.7210864903502502, + "grad_norm": 0.6347305774688721, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0307, + "step": 25220 + }, + { + "epoch": 0.7213724088634739, + "grad_norm": 0.7043943405151367, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0267, + "step": 25230 + }, + { + "epoch": 0.7216583273766977, + "grad_norm": 0.4137915074825287, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.028, + "step": 25240 + }, + { + "epoch": 0.7219442458899213, + "grad_norm": 0.4374844431877136, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0246, + "step": 25250 + }, + { + "epoch": 0.7222301644031451, + "grad_norm": 0.6796316504478455, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0285, + "step": 25260 + }, + { + "epoch": 0.7225160829163688, + "grad_norm": 0.4662792980670929, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0287, + "step": 25270 + }, + { + "epoch": 0.7228020014295926, + "grad_norm": 0.4035339653491974, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0289, + "step": 25280 + }, + { + "epoch": 0.7230879199428163, + "grad_norm": 0.40217533707618713, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0238, + "step": 25290 + }, + { + "epoch": 0.72337383845604, + "grad_norm": 0.3640667796134949, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0303, + "step": 25300 + }, + { + "epoch": 0.7236597569692638, + "grad_norm": 0.38176655769348145, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.0283, + "step": 25310 + }, + { + "epoch": 0.7239456754824874, + "grad_norm": 0.40747207403182983, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.031, + "step": 25320 + }, + { + "epoch": 0.7242315939957112, + "grad_norm": 0.3859431743621826, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0306, + "step": 25330 + }, + { + "epoch": 0.7245175125089349, + "grad_norm": 0.23738636076450348, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0313, + "step": 25340 + }, + { + "epoch": 0.7248034310221587, + "grad_norm": 0.3772980272769928, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0308, + "step": 25350 + }, + { + "epoch": 0.7250893495353824, + "grad_norm": 0.5451138019561768, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.03, + "step": 25360 + }, + { + "epoch": 0.7253752680486062, + "grad_norm": 0.6431843638420105, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0371, + "step": 25370 + }, + { + "epoch": 0.7256611865618299, + "grad_norm": 0.42552369832992554, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0311, + "step": 25380 + }, + { + "epoch": 0.7259471050750536, + "grad_norm": 0.5802433490753174, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0316, + "step": 25390 + }, + { + "epoch": 0.7262330235882773, + "grad_norm": 0.31489041447639465, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0282, + "step": 25400 + }, + { + "epoch": 0.726518942101501, + "grad_norm": 0.4227478504180908, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0274, + "step": 25410 + }, + { + "epoch": 0.7268048606147248, + "grad_norm": 0.5510851740837097, + "learning_rate": 3.085688933413021e-06, + "loss": 0.0297, + "step": 25420 + }, + { + "epoch": 0.7270907791279485, + "grad_norm": 0.3073323667049408, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0244, + "step": 25430 + }, + { + "epoch": 0.7273766976411723, + "grad_norm": 0.7394781112670898, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.028, + "step": 25440 + }, + { + "epoch": 0.727662616154396, + "grad_norm": 0.5067957639694214, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0374, + "step": 25450 + }, + { + "epoch": 0.7279485346676198, + "grad_norm": 0.4093882739543915, + "learning_rate": 3.067194157156521e-06, + "loss": 0.0347, + "step": 25460 + }, + { + "epoch": 0.7282344531808435, + "grad_norm": 0.37054866552352905, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0305, + "step": 25470 + }, + { + "epoch": 0.7285203716940671, + "grad_norm": 0.38795027136802673, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0282, + "step": 25480 + }, + { + "epoch": 0.7288062902072909, + "grad_norm": 0.49282407760620117, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0301, + "step": 25490 + }, + { + "epoch": 0.7290922087205146, + "grad_norm": 0.5234564542770386, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0352, + "step": 25500 + }, + { + "epoch": 0.7293781272337384, + "grad_norm": 0.5383297801017761, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0317, + "step": 25510 + }, + { + "epoch": 0.7296640457469621, + "grad_norm": 0.4277333617210388, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0278, + "step": 25520 + }, + { + "epoch": 0.7299499642601859, + "grad_norm": 0.6099430322647095, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0356, + "step": 25530 + }, + { + "epoch": 0.7302358827734096, + "grad_norm": 0.38870710134506226, + "learning_rate": 3.030651808761638e-06, + "loss": 0.027, + "step": 25540 + }, + { + "epoch": 0.7305218012866334, + "grad_norm": 0.48884090781211853, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0251, + "step": 25550 + }, + { + "epoch": 0.730807719799857, + "grad_norm": 0.5136672258377075, + "learning_rate": 3.021609639602321e-06, + "loss": 0.025, + "step": 25560 + }, + { + "epoch": 0.7310936383130807, + "grad_norm": 0.527056872844696, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.03, + "step": 25570 + }, + { + "epoch": 0.7313795568263045, + "grad_norm": 0.7081360220909119, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0303, + "step": 25580 + }, + { + "epoch": 0.7316654753395282, + "grad_norm": 0.48397257924079895, + "learning_rate": 3.008116622200155e-06, + "loss": 0.032, + "step": 25590 + }, + { + "epoch": 0.731951393852752, + "grad_norm": 0.38431495428085327, + "learning_rate": 3.003637700546652e-06, + "loss": 0.0337, + "step": 25600 + }, + { + "epoch": 0.7322373123659757, + "grad_norm": 0.48320460319519043, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0336, + "step": 25610 + }, + { + "epoch": 0.7325232308791995, + "grad_norm": 0.3164500892162323, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0244, + "step": 25620 + }, + { + "epoch": 0.7328091493924231, + "grad_norm": 0.5140587091445923, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0267, + "step": 25630 + }, + { + "epoch": 0.7330950679056469, + "grad_norm": 0.30739104747772217, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0373, + "step": 25640 + }, + { + "epoch": 0.7333809864188706, + "grad_norm": 0.3579956591129303, + "learning_rate": 2.981383959667165e-06, + "loss": 0.0328, + "step": 25650 + }, + { + "epoch": 0.7336669049320943, + "grad_norm": 0.7733256220817566, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0335, + "step": 25660 + }, + { + "epoch": 0.7339528234453181, + "grad_norm": 0.5355008244514465, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0291, + "step": 25670 + }, + { + "epoch": 0.7342387419585418, + "grad_norm": 0.5733621120452881, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0223, + "step": 25680 + }, + { + "epoch": 0.7345246604717656, + "grad_norm": 0.4484233260154724, + "learning_rate": 2.963750320724704e-06, + "loss": 0.03, + "step": 25690 + }, + { + "epoch": 0.7348105789849892, + "grad_norm": 0.46975597739219666, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0325, + "step": 25700 + }, + { + "epoch": 0.735096497498213, + "grad_norm": 0.4674699008464813, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0321, + "step": 25710 + }, + { + "epoch": 0.7353824160114367, + "grad_norm": 0.301565557718277, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0279, + "step": 25720 + }, + { + "epoch": 0.7356683345246605, + "grad_norm": 0.41966041922569275, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0319, + "step": 25730 + }, + { + "epoch": 0.7359542530378842, + "grad_norm": 0.5388277173042297, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0287, + "step": 25740 + }, + { + "epoch": 0.7362401715511079, + "grad_norm": 0.5821589231491089, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0298, + "step": 25750 + }, + { + "epoch": 0.7365260900643317, + "grad_norm": 0.9340733289718628, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0307, + "step": 25760 + }, + { + "epoch": 0.7368120085775554, + "grad_norm": 0.3654371201992035, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0284, + "step": 25770 + }, + { + "epoch": 0.7370979270907791, + "grad_norm": 0.38794293999671936, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0306, + "step": 25780 + }, + { + "epoch": 0.7373838456040028, + "grad_norm": 0.39955422282218933, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.0324, + "step": 25790 + }, + { + "epoch": 0.7376697641172266, + "grad_norm": 0.5864313244819641, + "learning_rate": 2.916036854664115e-06, + "loss": 0.031, + "step": 25800 + }, + { + "epoch": 0.7379556826304503, + "grad_norm": 0.4324203431606293, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0253, + "step": 25810 + }, + { + "epoch": 0.7382416011436741, + "grad_norm": 0.6346203684806824, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0278, + "step": 25820 + }, + { + "epoch": 0.7385275196568978, + "grad_norm": 0.3984649181365967, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0352, + "step": 25830 + }, + { + "epoch": 0.7388134381701215, + "grad_norm": 0.3954542577266693, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0305, + "step": 25840 + }, + { + "epoch": 0.7390993566833453, + "grad_norm": 0.3119542598724365, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0372, + "step": 25850 + }, + { + "epoch": 0.7393852751965689, + "grad_norm": 0.4094623029232025, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0251, + "step": 25860 + }, + { + "epoch": 0.7396711937097927, + "grad_norm": 0.5250104665756226, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0302, + "step": 25870 + }, + { + "epoch": 0.7399571122230164, + "grad_norm": 0.7610230445861816, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0257, + "step": 25880 + }, + { + "epoch": 0.7402430307362402, + "grad_norm": 0.5546014904975891, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0249, + "step": 25890 + }, + { + "epoch": 0.7405289492494639, + "grad_norm": 0.22835634648799896, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0287, + "step": 25900 + }, + { + "epoch": 0.7408148677626877, + "grad_norm": 0.7073826789855957, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0409, + "step": 25910 + }, + { + "epoch": 0.7411007862759114, + "grad_norm": 0.604634165763855, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0293, + "step": 25920 + }, + { + "epoch": 0.741386704789135, + "grad_norm": 0.46605581045150757, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0261, + "step": 25930 + }, + { + "epoch": 0.7416726233023588, + "grad_norm": 0.35719090700149536, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0304, + "step": 25940 + }, + { + "epoch": 0.7419585418155825, + "grad_norm": 0.3806651532649994, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0313, + "step": 25950 + }, + { + "epoch": 0.7422444603288063, + "grad_norm": 0.6443240642547607, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0303, + "step": 25960 + }, + { + "epoch": 0.74253037884203, + "grad_norm": 0.42187514901161194, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0282, + "step": 25970 + }, + { + "epoch": 0.7428162973552538, + "grad_norm": 0.4213440418243408, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0312, + "step": 25980 + }, + { + "epoch": 0.7431022158684775, + "grad_norm": 0.3982003331184387, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0279, + "step": 25990 + }, + { + "epoch": 0.7433881343817013, + "grad_norm": 0.3418596386909485, + "learning_rate": 2.832230653119002e-06, + "loss": 0.0318, + "step": 26000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.624061151019008e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/training_args.bin b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a8e9db2fc8c02e02c3d9dc8ab6720ad303a5b3a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612ba70c7690571cb25b3741b149289d0da6675f330268700d4dd75e92ecc19a +size 6097 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/generation_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..69a1d5b85ede7c969e604b5d8b62e78a67f7f018 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c8ff0897ee1b13ebd0accc6092502fdbd99e5eeda135e6ba957e6b65dd58adb +size 4921072616 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..532692b4e88ff317d18bc5de6bdcee4e338e40b1 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9d735c2ced6b446ad0c151fc881afc213e03f0b87ed6dcbd9be7756614cd4f4 +size 4978830984 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..54f1305afa65f84d1215c2b1d001ac6accbf18a4 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:add92d4c17ad323e4714c99fb7f098c393e4a29466defbcb7ce335458e219b4d +size 4100977896 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..7a37358d95e92a337ffbc69008e6d3a514583ff2 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -15.553912042236327, + -29.199742523193358, + -19.58108451538086, + -2.290254103851318, + -3.98537020587921, + -3.326780859374999, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 20.256868560791013, + 29.94644501495361, + 21.81786548461914, + 2.931905368041992, + 5.064435471534729, + 3.8213318216323877, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.8829866647720337, + 2.0021812915802, + 0.2094610631465912, + 0.0940750315785408, + 0.0910087525844574, + 0.012966467998921871, + -0.09716881066560745, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.976093769073486, + 10.930583953857422, + 8.330232620239258, + 0.8605863451957703, + 1.5304595232009888, + 1.1747541427612305, + 0.995267927646637, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -14.624815139007566, + -31.510755078125, + -35.281760287475585, + -4.413841687011719, + -8.509904860687255, + -6.548201916885375, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 40.4127169593811, + 31.91034956970215, + 26.84413584289551, + 7.540738459014893, + 10.178268561553956, + 9.913993389892582, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 10.31286334991455, + 3.0421667098999023, + -4.947638511657715, + 0.41632387042045593, + -0.9987452030181885, + -0.18793217837810516, + -0.08814626932144165, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 10.463665962219238, + 14.231209754943848, + 11.03242301940918, + 2.1795010566711426, + 3.3540749549865723, + 2.708117961883545, + 0.9961075186729431, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0e452e5d4e75a56ef7615795d1656a47d3616b4c --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json @@ -0,0 +1,19634 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8005718370264474, + "eval_steps": 500, + "global_step": 28000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002859185132237312, + "grad_norm": 4.32843542098999, + "learning_rate": 1.8e-07, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0005718370264474624, + "grad_norm": 5.184113502502441, + "learning_rate": 3.8e-07, + "loss": 0.6206, + "step": 20 + }, + { + "epoch": 0.0008577555396711937, + "grad_norm": 4.515527248382568, + "learning_rate": 5.800000000000001e-07, + "loss": 0.582, + "step": 30 + }, + { + "epoch": 0.0011436740528949249, + "grad_norm": 2.8382818698883057, + "learning_rate": 7.8e-07, + "loss": 0.544, + "step": 40 + }, + { + "epoch": 0.0014295925661186562, + "grad_norm": 4.019079208374023, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6381, + "step": 50 + }, + { + "epoch": 0.0017155110793423873, + "grad_norm": 2.9916157722473145, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5463, + "step": 60 + }, + { + "epoch": 0.0020014295925661185, + "grad_norm": 3.3288328647613525, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.446, + "step": 70 + }, + { + "epoch": 0.0022873481057898498, + "grad_norm": 3.181410312652588, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4497, + "step": 80 + }, + { + "epoch": 0.002573266619013581, + "grad_norm": 1.421942949295044, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.349, + "step": 90 + }, + { + "epoch": 0.0028591851322373124, + "grad_norm": 1.908596396446228, + "learning_rate": 1.98e-06, + "loss": 0.3338, + "step": 100 + }, + { + "epoch": 0.0031451036454610438, + "grad_norm": 1.8309729099273682, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.0034310221586847747, + "grad_norm": 3.051408290863037, + "learning_rate": 2.38e-06, + "loss": 0.2418, + "step": 120 + }, + { + "epoch": 0.003716940671908506, + "grad_norm": 2.4083356857299805, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1726, + "step": 130 + }, + { + "epoch": 0.004002859185132237, + "grad_norm": 1.111687421798706, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.2164, + "step": 140 + }, + { + "epoch": 0.004288777698355968, + "grad_norm": 1.3874679803848267, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1312, + "step": 150 + }, + { + "epoch": 0.0045746962115796996, + "grad_norm": 1.2791540622711182, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1198, + "step": 160 + }, + { + "epoch": 0.004860614724803431, + "grad_norm": 1.6237181425094604, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1027, + "step": 170 + }, + { + "epoch": 0.005146533238027162, + "grad_norm": 0.9669432640075684, + "learning_rate": 3.58e-06, + "loss": 0.0968, + "step": 180 + }, + { + "epoch": 0.0054324517512508936, + "grad_norm": 1.4933182001113892, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.1012, + "step": 190 + }, + { + "epoch": 0.005718370264474625, + "grad_norm": 1.8615745306015015, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0901, + "step": 200 + }, + { + "epoch": 0.006004288777698356, + "grad_norm": 1.867163062095642, + "learning_rate": 4.18e-06, + "loss": 0.1067, + "step": 210 + }, + { + "epoch": 0.0062902072909220876, + "grad_norm": 1.199497103691101, + "learning_rate": 4.38e-06, + "loss": 0.0841, + "step": 220 + }, + { + "epoch": 0.006576125804145818, + "grad_norm": 1.1568272113800049, + "learning_rate": 4.58e-06, + "loss": 0.0951, + "step": 230 + }, + { + "epoch": 0.006862044317369549, + "grad_norm": 2.139226198196411, + "learning_rate": 4.78e-06, + "loss": 0.0845, + "step": 240 + }, + { + "epoch": 0.007147962830593281, + "grad_norm": 1.0357667207717896, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0828, + "step": 250 + }, + { + "epoch": 0.007433881343817012, + "grad_norm": 1.0145683288574219, + "learning_rate": 5.18e-06, + "loss": 0.0925, + "step": 260 + }, + { + "epoch": 0.007719799857040743, + "grad_norm": 1.308053731918335, + "learning_rate": 5.380000000000001e-06, + "loss": 0.082, + "step": 270 + }, + { + "epoch": 0.008005718370264474, + "grad_norm": 1.1561739444732666, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0888, + "step": 280 + }, + { + "epoch": 0.008291636883488206, + "grad_norm": 0.8777005672454834, + "learning_rate": 5.78e-06, + "loss": 0.0693, + "step": 290 + }, + { + "epoch": 0.008577555396711936, + "grad_norm": 0.9127368330955505, + "learning_rate": 5.98e-06, + "loss": 0.0823, + "step": 300 + }, + { + "epoch": 0.008863473909935669, + "grad_norm": 0.5608117580413818, + "learning_rate": 6.18e-06, + "loss": 0.0733, + "step": 310 + }, + { + "epoch": 0.009149392423159399, + "grad_norm": 1.9068444967269897, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0772, + "step": 320 + }, + { + "epoch": 0.009435310936383131, + "grad_norm": 0.9090886116027832, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.062, + "step": 330 + }, + { + "epoch": 0.009721229449606862, + "grad_norm": 1.191778540611267, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0718, + "step": 340 + }, + { + "epoch": 0.010007147962830594, + "grad_norm": 1.3743036985397339, + "learning_rate": 6.98e-06, + "loss": 0.0822, + "step": 350 + }, + { + "epoch": 0.010293066476054324, + "grad_norm": 1.4244364500045776, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0793, + "step": 360 + }, + { + "epoch": 0.010578984989278055, + "grad_norm": 1.1766910552978516, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0637, + "step": 370 + }, + { + "epoch": 0.010864903502501787, + "grad_norm": 1.1331329345703125, + "learning_rate": 7.58e-06, + "loss": 0.0705, + "step": 380 + }, + { + "epoch": 0.011150822015725518, + "grad_norm": 0.4898548424243927, + "learning_rate": 7.78e-06, + "loss": 0.0686, + "step": 390 + }, + { + "epoch": 0.01143674052894925, + "grad_norm": 0.7398406267166138, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0719, + "step": 400 + }, + { + "epoch": 0.01172265904217298, + "grad_norm": 1.1516162157058716, + "learning_rate": 8.18e-06, + "loss": 0.0696, + "step": 410 + }, + { + "epoch": 0.012008577555396712, + "grad_norm": 1.6034163236618042, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0698, + "step": 420 + }, + { + "epoch": 0.012294496068620443, + "grad_norm": 1.2195311784744263, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.012580414581844175, + "grad_norm": 1.1106441020965576, + "learning_rate": 8.78e-06, + "loss": 0.0749, + "step": 440 + }, + { + "epoch": 0.012866333095067906, + "grad_norm": 1.1787506341934204, + "learning_rate": 8.98e-06, + "loss": 0.0718, + "step": 450 + }, + { + "epoch": 0.013152251608291636, + "grad_norm": 0.4380492568016052, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0692, + "step": 460 + }, + { + "epoch": 0.013438170121515368, + "grad_norm": 1.0138392448425293, + "learning_rate": 9.38e-06, + "loss": 0.0718, + "step": 470 + }, + { + "epoch": 0.013724088634739099, + "grad_norm": 0.50003582239151, + "learning_rate": 9.58e-06, + "loss": 0.078, + "step": 480 + }, + { + "epoch": 0.014010007147962831, + "grad_norm": 0.6253323554992676, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0762, + "step": 490 + }, + { + "epoch": 0.014295925661186561, + "grad_norm": 0.6725791096687317, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0615, + "step": 500 + }, + { + "epoch": 0.014581844174410294, + "grad_norm": 0.6100206971168518, + "learning_rate": 1.018e-05, + "loss": 0.0576, + "step": 510 + }, + { + "epoch": 0.014867762687634024, + "grad_norm": 1.9225071668624878, + "learning_rate": 1.038e-05, + "loss": 0.0957, + "step": 520 + }, + { + "epoch": 0.015153681200857756, + "grad_norm": 1.304625391960144, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.015439599714081487, + "grad_norm": 0.7657200694084167, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0612, + "step": 540 + }, + { + "epoch": 0.015725518227305217, + "grad_norm": 0.7371220588684082, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0719, + "step": 550 + }, + { + "epoch": 0.016011436740528948, + "grad_norm": 0.7274985313415527, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0594, + "step": 560 + }, + { + "epoch": 0.01629735525375268, + "grad_norm": 1.3222947120666504, + "learning_rate": 1.138e-05, + "loss": 0.0655, + "step": 570 + }, + { + "epoch": 0.016583273766976412, + "grad_norm": 0.965411901473999, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0638, + "step": 580 + }, + { + "epoch": 0.016869192280200142, + "grad_norm": 0.8161532878875732, + "learning_rate": 1.178e-05, + "loss": 0.0532, + "step": 590 + }, + { + "epoch": 0.017155110793423873, + "grad_norm": 0.8228808045387268, + "learning_rate": 1.198e-05, + "loss": 0.051, + "step": 600 + }, + { + "epoch": 0.017441029306647607, + "grad_norm": 0.6932743191719055, + "learning_rate": 1.218e-05, + "loss": 0.0595, + "step": 610 + }, + { + "epoch": 0.017726947819871337, + "grad_norm": 0.6848511099815369, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0589, + "step": 620 + }, + { + "epoch": 0.018012866333095068, + "grad_norm": 1.137454867362976, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0694, + "step": 630 + }, + { + "epoch": 0.018298784846318798, + "grad_norm": 0.8087878227233887, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0738, + "step": 640 + }, + { + "epoch": 0.01858470335954253, + "grad_norm": 0.8093737363815308, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.058, + "step": 650 + }, + { + "epoch": 0.018870621872766263, + "grad_norm": 0.8387401700019836, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0686, + "step": 660 + }, + { + "epoch": 0.019156540385989993, + "grad_norm": 1.1544110774993896, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0592, + "step": 670 + }, + { + "epoch": 0.019442458899213724, + "grad_norm": 0.8208314180374146, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.019728377412437454, + "grad_norm": 0.97088623046875, + "learning_rate": 1.378e-05, + "loss": 0.0675, + "step": 690 + }, + { + "epoch": 0.020014295925661188, + "grad_norm": 1.0991814136505127, + "learning_rate": 1.398e-05, + "loss": 0.0745, + "step": 700 + }, + { + "epoch": 0.02030021443888492, + "grad_norm": 0.9467299580574036, + "learning_rate": 1.418e-05, + "loss": 0.0645, + "step": 710 + }, + { + "epoch": 0.02058613295210865, + "grad_norm": 0.4910801351070404, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0466, + "step": 720 + }, + { + "epoch": 0.02087205146533238, + "grad_norm": 1.0102845430374146, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0735, + "step": 730 + }, + { + "epoch": 0.02115796997855611, + "grad_norm": 0.9033467769622803, + "learning_rate": 1.478e-05, + "loss": 0.0741, + "step": 740 + }, + { + "epoch": 0.021443888491779844, + "grad_norm": 1.6092171669006348, + "learning_rate": 1.498e-05, + "loss": 0.0737, + "step": 750 + }, + { + "epoch": 0.021729807005003574, + "grad_norm": 0.7047333717346191, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0604, + "step": 760 + }, + { + "epoch": 0.022015725518227305, + "grad_norm": 1.2015491724014282, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0646, + "step": 770 + }, + { + "epoch": 0.022301644031451035, + "grad_norm": 1.1669623851776123, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0587, + "step": 780 + }, + { + "epoch": 0.02258756254467477, + "grad_norm": 1.137113094329834, + "learning_rate": 1.578e-05, + "loss": 0.0692, + "step": 790 + }, + { + "epoch": 0.0228734810578985, + "grad_norm": 1.269505262374878, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0711, + "step": 800 + }, + { + "epoch": 0.02315939957112223, + "grad_norm": 0.942534863948822, + "learning_rate": 1.618e-05, + "loss": 0.0782, + "step": 810 + }, + { + "epoch": 0.02344531808434596, + "grad_norm": 0.9548556208610535, + "learning_rate": 1.638e-05, + "loss": 0.0814, + "step": 820 + }, + { + "epoch": 0.02373123659756969, + "grad_norm": 1.0210421085357666, + "learning_rate": 1.658e-05, + "loss": 0.0774, + "step": 830 + }, + { + "epoch": 0.024017155110793425, + "grad_norm": 1.0955135822296143, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0693, + "step": 840 + }, + { + "epoch": 0.024303073624017155, + "grad_norm": 1.2081682682037354, + "learning_rate": 1.698e-05, + "loss": 0.0589, + "step": 850 + }, + { + "epoch": 0.024588992137240886, + "grad_norm": 0.9728164076805115, + "learning_rate": 1.718e-05, + "loss": 0.0585, + "step": 860 + }, + { + "epoch": 0.024874910650464616, + "grad_norm": 1.310244083404541, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.066, + "step": 870 + }, + { + "epoch": 0.02516082916368835, + "grad_norm": 0.8860681653022766, + "learning_rate": 1.758e-05, + "loss": 0.0703, + "step": 880 + }, + { + "epoch": 0.02544674767691208, + "grad_norm": 2.1878466606140137, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0913, + "step": 890 + }, + { + "epoch": 0.02573266619013581, + "grad_norm": 0.6659205555915833, + "learning_rate": 1.798e-05, + "loss": 0.0603, + "step": 900 + }, + { + "epoch": 0.02601858470335954, + "grad_norm": 0.6700656414031982, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.074, + "step": 910 + }, + { + "epoch": 0.026304503216583272, + "grad_norm": 0.8292778134346008, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0529, + "step": 920 + }, + { + "epoch": 0.026590421729807006, + "grad_norm": 0.9637550115585327, + "learning_rate": 1.858e-05, + "loss": 0.0604, + "step": 930 + }, + { + "epoch": 0.026876340243030736, + "grad_norm": 0.4605652689933777, + "learning_rate": 1.878e-05, + "loss": 0.0657, + "step": 940 + }, + { + "epoch": 0.027162258756254467, + "grad_norm": 1.3346972465515137, + "learning_rate": 1.898e-05, + "loss": 0.0576, + "step": 950 + }, + { + "epoch": 0.027448177269478197, + "grad_norm": 0.8369432091712952, + "learning_rate": 1.918e-05, + "loss": 0.0567, + "step": 960 + }, + { + "epoch": 0.02773409578270193, + "grad_norm": 0.613459050655365, + "learning_rate": 1.938e-05, + "loss": 0.0523, + "step": 970 + }, + { + "epoch": 0.028020014295925662, + "grad_norm": 1.402799367904663, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0794, + "step": 980 + }, + { + "epoch": 0.028305932809149392, + "grad_norm": 1.1603201627731323, + "learning_rate": 1.978e-05, + "loss": 0.0583, + "step": 990 + }, + { + "epoch": 0.028591851322373123, + "grad_norm": 0.8101517558097839, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0584, + "step": 1000 + }, + { + "epoch": 0.028877769835596853, + "grad_norm": 1.060592770576477, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.077, + "step": 1010 + }, + { + "epoch": 0.029163688348820587, + "grad_norm": 1.2096195220947266, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.066, + "step": 1020 + }, + { + "epoch": 0.029449606862044318, + "grad_norm": 1.0035862922668457, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0625, + "step": 1030 + }, + { + "epoch": 0.029735525375268048, + "grad_norm": 0.44185084104537964, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.03002144388849178, + "grad_norm": 1.209908127784729, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0693, + "step": 1050 + }, + { + "epoch": 0.030307362401715512, + "grad_norm": 0.9716938138008118, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0706, + "step": 1060 + }, + { + "epoch": 0.030593280914939243, + "grad_norm": 0.8310994505882263, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0622, + "step": 1070 + }, + { + "epoch": 0.030879199428162973, + "grad_norm": 0.8737888932228088, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0564, + "step": 1080 + }, + { + "epoch": 0.031165117941386704, + "grad_norm": 0.7609763145446777, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0552, + "step": 1090 + }, + { + "epoch": 0.031451036454610434, + "grad_norm": 0.6319764256477356, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0593, + "step": 1100 + }, + { + "epoch": 0.031736954967834165, + "grad_norm": 0.5562251806259155, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0553, + "step": 1110 + }, + { + "epoch": 0.032022873481057895, + "grad_norm": 1.3476046323776245, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0805, + "step": 1120 + }, + { + "epoch": 0.03230879199428163, + "grad_norm": 0.5449394583702087, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0666, + "step": 1130 + }, + { + "epoch": 0.03259471050750536, + "grad_norm": 0.8675817251205444, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0703, + "step": 1140 + }, + { + "epoch": 0.032880629020729094, + "grad_norm": 0.8713150024414062, + "learning_rate": 1.999882759038658e-05, + "loss": 0.063, + "step": 1150 + }, + { + "epoch": 0.033166547533952824, + "grad_norm": 0.7205761075019836, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0597, + "step": 1160 + }, + { + "epoch": 0.033452466047176554, + "grad_norm": 0.482741117477417, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0558, + "step": 1170 + }, + { + "epoch": 0.033738384560400285, + "grad_norm": 0.8652167320251465, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0559, + "step": 1180 + }, + { + "epoch": 0.034024303073624015, + "grad_norm": 0.5286755561828613, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0673, + "step": 1190 + }, + { + "epoch": 0.034310221586847746, + "grad_norm": 0.9883217215538025, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0609, + "step": 1200 + }, + { + "epoch": 0.034596140100071476, + "grad_norm": 0.7700253129005432, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0703, + "step": 1210 + }, + { + "epoch": 0.034882058613295214, + "grad_norm": 0.8669867515563965, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0577, + "step": 1220 + }, + { + "epoch": 0.035167977126518944, + "grad_norm": 0.8856104016304016, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0599, + "step": 1230 + }, + { + "epoch": 0.035453895639742675, + "grad_norm": 0.5517004728317261, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0791, + "step": 1240 + }, + { + "epoch": 0.035739814152966405, + "grad_norm": 0.7505853176116943, + "learning_rate": 1.999672592499692e-05, + "loss": 0.086, + "step": 1250 + }, + { + "epoch": 0.036025732666190136, + "grad_norm": 0.7412230968475342, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0534, + "step": 1260 + }, + { + "epoch": 0.036311651179413866, + "grad_norm": 0.6629419922828674, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0607, + "step": 1270 + }, + { + "epoch": 0.036597569692637597, + "grad_norm": 0.7081887125968933, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0513, + "step": 1280 + }, + { + "epoch": 0.03688348820586133, + "grad_norm": 0.8555129766464233, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0574, + "step": 1290 + }, + { + "epoch": 0.03716940671908506, + "grad_norm": 0.5992563366889954, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0529, + "step": 1300 + }, + { + "epoch": 0.037455325232308795, + "grad_norm": 0.8527185320854187, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0588, + "step": 1310 + }, + { + "epoch": 0.037741243745532525, + "grad_norm": 1.078600525856018, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0644, + "step": 1320 + }, + { + "epoch": 0.038027162258756256, + "grad_norm": 0.8158502578735352, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0587, + "step": 1330 + }, + { + "epoch": 0.038313080771979986, + "grad_norm": 1.011278748512268, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0696, + "step": 1340 + }, + { + "epoch": 0.03859899928520372, + "grad_norm": 0.806888222694397, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0588, + "step": 1350 + }, + { + "epoch": 0.03888491779842745, + "grad_norm": 0.7776031494140625, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0461, + "step": 1360 + }, + { + "epoch": 0.03917083631165118, + "grad_norm": 0.6119349598884583, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0566, + "step": 1370 + }, + { + "epoch": 0.03945675482487491, + "grad_norm": 0.6168059706687927, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0636, + "step": 1380 + }, + { + "epoch": 0.03974267333809864, + "grad_norm": 0.8180692195892334, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0596, + "step": 1390 + }, + { + "epoch": 0.040028591851322376, + "grad_norm": 0.6775726079940796, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0586, + "step": 1400 + }, + { + "epoch": 0.040314510364546106, + "grad_norm": 0.7446377873420715, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.057, + "step": 1410 + }, + { + "epoch": 0.04060042887776984, + "grad_norm": 0.9334514737129211, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0551, + "step": 1420 + }, + { + "epoch": 0.04088634739099357, + "grad_norm": 1.481874942779541, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0664, + "step": 1430 + }, + { + "epoch": 0.0411722659042173, + "grad_norm": 0.9553850889205933, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0645, + "step": 1440 + }, + { + "epoch": 0.04145818441744103, + "grad_norm": 0.8824119567871094, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0694, + "step": 1450 + }, + { + "epoch": 0.04174410293066476, + "grad_norm": 1.0382661819458008, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0683, + "step": 1460 + }, + { + "epoch": 0.04203002144388849, + "grad_norm": 0.5914127826690674, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0652, + "step": 1470 + }, + { + "epoch": 0.04231593995711222, + "grad_norm": 0.8497964143753052, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0706, + "step": 1480 + }, + { + "epoch": 0.04260185847033596, + "grad_norm": 0.897759199142456, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0647, + "step": 1490 + }, + { + "epoch": 0.04288777698355969, + "grad_norm": 1.1102443933486938, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0579, + "step": 1500 + }, + { + "epoch": 0.04317369549678342, + "grad_norm": 0.7638678550720215, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0606, + "step": 1510 + }, + { + "epoch": 0.04345961401000715, + "grad_norm": 0.6662708520889282, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.067, + "step": 1520 + }, + { + "epoch": 0.04374553252323088, + "grad_norm": 0.4957924485206604, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0685, + "step": 1530 + }, + { + "epoch": 0.04403145103645461, + "grad_norm": 0.6456794738769531, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0665, + "step": 1540 + }, + { + "epoch": 0.04431736954967834, + "grad_norm": 1.1598498821258545, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0527, + "step": 1550 + }, + { + "epoch": 0.04460328806290207, + "grad_norm": 0.931520938873291, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0682, + "step": 1560 + }, + { + "epoch": 0.0448892065761258, + "grad_norm": 0.7289925813674927, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0726, + "step": 1570 + }, + { + "epoch": 0.04517512508934954, + "grad_norm": 0.5471235513687134, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0561, + "step": 1580 + }, + { + "epoch": 0.04546104360257327, + "grad_norm": 0.8686550259590149, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0552, + "step": 1590 + }, + { + "epoch": 0.045746962115797, + "grad_norm": 1.1767120361328125, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0544, + "step": 1600 + }, + { + "epoch": 0.04603288062902073, + "grad_norm": 0.8729729056358337, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0625, + "step": 1610 + }, + { + "epoch": 0.04631879914224446, + "grad_norm": 1.3734601736068726, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0667, + "step": 1620 + }, + { + "epoch": 0.04660471765546819, + "grad_norm": 0.6810682415962219, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0522, + "step": 1630 + }, + { + "epoch": 0.04689063616869192, + "grad_norm": 0.7744873762130737, + "learning_rate": 1.997844517262844e-05, + "loss": 0.06, + "step": 1640 + }, + { + "epoch": 0.04717655468191565, + "grad_norm": 1.000954270362854, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0606, + "step": 1650 + }, + { + "epoch": 0.04746247319513938, + "grad_norm": 0.8105701208114624, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.04774839170836312, + "grad_norm": 0.9504240155220032, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0833, + "step": 1670 + }, + { + "epoch": 0.04803431022158685, + "grad_norm": 0.910836935043335, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0582, + "step": 1680 + }, + { + "epoch": 0.04832022873481058, + "grad_norm": 0.5865645408630371, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0657, + "step": 1690 + }, + { + "epoch": 0.04860614724803431, + "grad_norm": 1.0098698139190674, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 0.04889206576125804, + "grad_norm": 0.8097764253616333, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0563, + "step": 1710 + }, + { + "epoch": 0.04917798427448177, + "grad_norm": 0.9958128333091736, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0597, + "step": 1720 + }, + { + "epoch": 0.0494639027877055, + "grad_norm": 0.8471905589103699, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.04974982130092923, + "grad_norm": 0.647058367729187, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 0.05003573981415296, + "grad_norm": 1.0832161903381348, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.052, + "step": 1750 + }, + { + "epoch": 0.0503216583273767, + "grad_norm": 0.8469381332397461, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0615, + "step": 1760 + }, + { + "epoch": 0.05060757684060043, + "grad_norm": 0.5371052622795105, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0556, + "step": 1770 + }, + { + "epoch": 0.05089349535382416, + "grad_norm": 0.9016183614730835, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0561, + "step": 1780 + }, + { + "epoch": 0.05117941386704789, + "grad_norm": 0.8829526305198669, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0648, + "step": 1790 + }, + { + "epoch": 0.05146533238027162, + "grad_norm": 1.079738974571228, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0577, + "step": 1800 + }, + { + "epoch": 0.05175125089349535, + "grad_norm": 0.7496556639671326, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.052, + "step": 1810 + }, + { + "epoch": 0.05203716940671908, + "grad_norm": 0.7587016820907593, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0634, + "step": 1820 + }, + { + "epoch": 0.052323087919942814, + "grad_norm": 0.9622246623039246, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0543, + "step": 1830 + }, + { + "epoch": 0.052609006433166544, + "grad_norm": 0.6643623113632202, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0606, + "step": 1840 + }, + { + "epoch": 0.05289492494639028, + "grad_norm": 0.8060843348503113, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0562, + "step": 1850 + }, + { + "epoch": 0.05318084345961401, + "grad_norm": 0.7353034019470215, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0679, + "step": 1860 + }, + { + "epoch": 0.05346676197283774, + "grad_norm": 0.6636782288551331, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0561, + "step": 1870 + }, + { + "epoch": 0.05375268048606147, + "grad_norm": 0.6760010719299316, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0674, + "step": 1880 + }, + { + "epoch": 0.0540385989992852, + "grad_norm": 0.7144591808319092, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0551, + "step": 1890 + }, + { + "epoch": 0.054324517512508934, + "grad_norm": 0.8346575498580933, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.049, + "step": 1900 + }, + { + "epoch": 0.054610436025732664, + "grad_norm": 1.1682871580123901, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0554, + "step": 1910 + }, + { + "epoch": 0.054896354538956395, + "grad_norm": 0.9150840640068054, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0549, + "step": 1920 + }, + { + "epoch": 0.055182273052180125, + "grad_norm": 0.37064746022224426, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0547, + "step": 1930 + }, + { + "epoch": 0.05546819156540386, + "grad_norm": 1.1214783191680908, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0697, + "step": 1940 + }, + { + "epoch": 0.05575411007862759, + "grad_norm": 0.8259853720664978, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0614, + "step": 1950 + }, + { + "epoch": 0.056040028591851324, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0811, + "step": 1960 + }, + { + "epoch": 0.056325947105075054, + "grad_norm": 0.8764797449111938, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0605, + "step": 1970 + }, + { + "epoch": 0.056611865618298784, + "grad_norm": 0.770044207572937, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0481, + "step": 1980 + }, + { + "epoch": 0.056897784131522515, + "grad_norm": 1.333876132965088, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0729, + "step": 1990 + }, + { + "epoch": 0.057183702644746245, + "grad_norm": 0.5231258273124695, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.051, + "step": 2000 + }, + { + "epoch": 0.057469621157969976, + "grad_norm": 1.1937541961669922, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.061, + "step": 2010 + }, + { + "epoch": 0.057755539671193706, + "grad_norm": 0.7843487858772278, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0688, + "step": 2020 + }, + { + "epoch": 0.058041458184417444, + "grad_norm": 0.7956593632698059, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0878, + "step": 2030 + }, + { + "epoch": 0.058327376697641174, + "grad_norm": 0.5006444454193115, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0588, + "step": 2040 + }, + { + "epoch": 0.058613295210864905, + "grad_norm": 1.162245750427246, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0619, + "step": 2050 + }, + { + "epoch": 0.058899213724088635, + "grad_norm": 0.46943384408950806, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0584, + "step": 2060 + }, + { + "epoch": 0.059185132237312366, + "grad_norm": 0.3780323266983032, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0462, + "step": 2070 + }, + { + "epoch": 0.059471050750536096, + "grad_norm": 0.7066171765327454, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0652, + "step": 2080 + }, + { + "epoch": 0.05975696926375983, + "grad_norm": 0.8464685082435608, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0546, + "step": 2090 + }, + { + "epoch": 0.06004288777698356, + "grad_norm": 0.7198944687843323, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0534, + "step": 2100 + }, + { + "epoch": 0.06032880629020729, + "grad_norm": 0.7136557698249817, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0665, + "step": 2110 + }, + { + "epoch": 0.060614724803431025, + "grad_norm": 0.8739225268363953, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0542, + "step": 2120 + }, + { + "epoch": 0.060900643316654755, + "grad_norm": 0.6694063544273376, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0575, + "step": 2130 + }, + { + "epoch": 0.061186561829878486, + "grad_norm": 0.4805296063423157, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0578, + "step": 2140 + }, + { + "epoch": 0.061472480343102216, + "grad_norm": 0.758660078048706, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0455, + "step": 2150 + }, + { + "epoch": 0.06175839885632595, + "grad_norm": 0.8114968538284302, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0801, + "step": 2160 + }, + { + "epoch": 0.06204431736954968, + "grad_norm": 0.6585670113563538, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0564, + "step": 2170 + }, + { + "epoch": 0.06233023588277341, + "grad_norm": 1.2986794710159302, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0595, + "step": 2180 + }, + { + "epoch": 0.06261615439599715, + "grad_norm": 0.9822471141815186, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0515, + "step": 2190 + }, + { + "epoch": 0.06290207290922087, + "grad_norm": 0.8112025260925293, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0585, + "step": 2200 + }, + { + "epoch": 0.0631879914224446, + "grad_norm": 0.6239551305770874, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0641, + "step": 2210 + }, + { + "epoch": 0.06347390993566833, + "grad_norm": 0.8405657410621643, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.057, + "step": 2220 + }, + { + "epoch": 0.06375982844889207, + "grad_norm": 0.4925670623779297, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0568, + "step": 2230 + }, + { + "epoch": 0.06404574696211579, + "grad_norm": 0.8599978089332581, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0587, + "step": 2240 + }, + { + "epoch": 0.06433166547533953, + "grad_norm": 0.8657258749008179, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.06461758398856327, + "grad_norm": 0.5826218128204346, + "learning_rate": 1.991642153373178e-05, + "loss": 0.055, + "step": 2260 + }, + { + "epoch": 0.06490350250178699, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0533, + "step": 2270 + }, + { + "epoch": 0.06518942101501073, + "grad_norm": 0.8345134258270264, + "learning_rate": 1.991374933341515e-05, + "loss": 0.064, + "step": 2280 + }, + { + "epoch": 0.06547533952823445, + "grad_norm": 0.6610177755355835, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0553, + "step": 2290 + }, + { + "epoch": 0.06576125804145819, + "grad_norm": 0.8541404604911804, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0483, + "step": 2300 + }, + { + "epoch": 0.06604717655468191, + "grad_norm": 0.9029123187065125, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0517, + "step": 2310 + }, + { + "epoch": 0.06633309506790565, + "grad_norm": 0.614111602306366, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.06661901358112937, + "grad_norm": 0.8723806142807007, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0511, + "step": 2330 + }, + { + "epoch": 0.06690493209435311, + "grad_norm": 0.5288586020469666, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0474, + "step": 2340 + }, + { + "epoch": 0.06719085060757685, + "grad_norm": 0.6346511840820312, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0532, + "step": 2350 + }, + { + "epoch": 0.06747676912080057, + "grad_norm": 0.9112687706947327, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0583, + "step": 2360 + }, + { + "epoch": 0.06776268763402431, + "grad_norm": 0.6879385113716125, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0551, + "step": 2370 + }, + { + "epoch": 0.06804860614724803, + "grad_norm": 0.6945562958717346, + "learning_rate": 1.989976094288735e-05, + "loss": 0.053, + "step": 2380 + }, + { + "epoch": 0.06833452466047177, + "grad_norm": 0.6774301528930664, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0596, + "step": 2390 + }, + { + "epoch": 0.06862044317369549, + "grad_norm": 0.7311446070671082, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0576, + "step": 2400 + }, + { + "epoch": 0.06890636168691923, + "grad_norm": 0.9301936030387878, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0608, + "step": 2410 + }, + { + "epoch": 0.06919228020014295, + "grad_norm": 1.1750341653823853, + "learning_rate": 1.989387305123247e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.06947819871336669, + "grad_norm": 0.716266930103302, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.053, + "step": 2430 + }, + { + "epoch": 0.06976411722659043, + "grad_norm": 0.8549973964691162, + "learning_rate": 1.989086647373215e-05, + "loss": 0.061, + "step": 2440 + }, + { + "epoch": 0.07005003573981415, + "grad_norm": 0.7306638360023499, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "epoch": 0.07033595425303789, + "grad_norm": 1.2529624700546265, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0597, + "step": 2460 + }, + { + "epoch": 0.07062187276626161, + "grad_norm": 0.7199717164039612, + "learning_rate": 1.988627835751598e-05, + "loss": 0.047, + "step": 2470 + }, + { + "epoch": 0.07090779127948535, + "grad_norm": 0.8007253408432007, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0544, + "step": 2480 + }, + { + "epoch": 0.07119370979270907, + "grad_norm": 0.7852535843849182, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0507, + "step": 2490 + }, + { + "epoch": 0.07147962830593281, + "grad_norm": 1.0649739503860474, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.058, + "step": 2500 + }, + { + "epoch": 0.07176554681915653, + "grad_norm": 0.8080071806907654, + "learning_rate": 1.988001487826387e-05, + "loss": 0.059, + "step": 2510 + }, + { + "epoch": 0.07205146533238027, + "grad_norm": 0.49453601241111755, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0522, + "step": 2520 + }, + { + "epoch": 0.07233738384560401, + "grad_norm": 0.7618975639343262, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.07262330235882773, + "grad_norm": 0.6284596920013428, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.058, + "step": 2540 + }, + { + "epoch": 0.07290922087205147, + "grad_norm": 1.6536812782287598, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0535, + "step": 2550 + }, + { + "epoch": 0.07319513938527519, + "grad_norm": 0.6516987681388855, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.061, + "step": 2560 + }, + { + "epoch": 0.07348105789849893, + "grad_norm": 0.7660441398620605, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0603, + "step": 2570 + }, + { + "epoch": 0.07376697641172265, + "grad_norm": 0.7900884747505188, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0494, + "step": 2580 + }, + { + "epoch": 0.07405289492494639, + "grad_norm": 0.9578459858894348, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0492, + "step": 2590 + }, + { + "epoch": 0.07433881343817011, + "grad_norm": 0.5268751978874207, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0524, + "step": 2600 + }, + { + "epoch": 0.07462473195139385, + "grad_norm": 0.8935990929603577, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0537, + "step": 2610 + }, + { + "epoch": 0.07491065046461759, + "grad_norm": 0.940441370010376, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0545, + "step": 2620 + }, + { + "epoch": 0.07519656897784131, + "grad_norm": 0.42767468094825745, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0508, + "step": 2630 + }, + { + "epoch": 0.07548248749106505, + "grad_norm": 0.6892207860946655, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0417, + "step": 2640 + }, + { + "epoch": 0.07576840600428877, + "grad_norm": 1.2622859477996826, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0665, + "step": 2650 + }, + { + "epoch": 0.07605432451751251, + "grad_norm": 0.8809115290641785, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0617, + "step": 2660 + }, + { + "epoch": 0.07634024303073624, + "grad_norm": 0.604371190071106, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0577, + "step": 2670 + }, + { + "epoch": 0.07662616154395997, + "grad_norm": 0.7091525793075562, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0552, + "step": 2680 + }, + { + "epoch": 0.0769120800571837, + "grad_norm": 0.7841326594352722, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0579, + "step": 2690 + }, + { + "epoch": 0.07719799857040743, + "grad_norm": 0.7789046764373779, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0511, + "step": 2700 + }, + { + "epoch": 0.07748391708363117, + "grad_norm": 0.6497660875320435, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0532, + "step": 2710 + }, + { + "epoch": 0.0777698355968549, + "grad_norm": 0.6902356147766113, + "learning_rate": 1.984439891859038e-05, + "loss": 0.06, + "step": 2720 + }, + { + "epoch": 0.07805575411007863, + "grad_norm": 0.5721703767776489, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0442, + "step": 2730 + }, + { + "epoch": 0.07834167262330236, + "grad_norm": 0.5205336809158325, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0551, + "step": 2740 + }, + { + "epoch": 0.07862759113652609, + "grad_norm": 1.0646073818206787, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0546, + "step": 2750 + }, + { + "epoch": 0.07891350964974982, + "grad_norm": 0.6809906363487244, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0603, + "step": 2760 + }, + { + "epoch": 0.07919942816297355, + "grad_norm": 0.7592756152153015, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0611, + "step": 2770 + }, + { + "epoch": 0.07948534667619728, + "grad_norm": 0.970733106136322, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.066, + "step": 2780 + }, + { + "epoch": 0.07977126518942101, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.983150881656814e-05, + "loss": 0.049, + "step": 2790 + }, + { + "epoch": 0.08005718370264475, + "grad_norm": 0.6761397123336792, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.048, + "step": 2800 + }, + { + "epoch": 0.08034310221586848, + "grad_norm": 0.9752228856086731, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.08062902072909221, + "grad_norm": 0.8727124929428101, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0629, + "step": 2820 + }, + { + "epoch": 0.08091493924231594, + "grad_norm": 0.8425240516662598, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0517, + "step": 2830 + }, + { + "epoch": 0.08120085775553967, + "grad_norm": 0.7011470198631287, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0484, + "step": 2840 + }, + { + "epoch": 0.0814867762687634, + "grad_norm": 0.836200475692749, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0504, + "step": 2850 + }, + { + "epoch": 0.08177269478198713, + "grad_norm": 0.4431964159011841, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0527, + "step": 2860 + }, + { + "epoch": 0.08205861329521086, + "grad_norm": 0.4666791260242462, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0556, + "step": 2870 + }, + { + "epoch": 0.0823445318084346, + "grad_norm": 0.5705346465110779, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0544, + "step": 2880 + }, + { + "epoch": 0.08263045032165833, + "grad_norm": 1.7237486839294434, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0555, + "step": 2890 + }, + { + "epoch": 0.08291636883488206, + "grad_norm": 0.9305147528648376, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.055, + "step": 2900 + }, + { + "epoch": 0.0832022873481058, + "grad_norm": 1.3475992679595947, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0723, + "step": 2910 + }, + { + "epoch": 0.08348820586132952, + "grad_norm": 0.7196787595748901, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0581, + "step": 2920 + }, + { + "epoch": 0.08377412437455325, + "grad_norm": 0.4567016363143921, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.08406004288777698, + "grad_norm": 0.8537796139717102, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0589, + "step": 2940 + }, + { + "epoch": 0.08434596140100072, + "grad_norm": 0.9526864886283875, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0581, + "step": 2950 + }, + { + "epoch": 0.08463187991422444, + "grad_norm": 0.8753517866134644, + "learning_rate": 1.979809151602651e-05, + "loss": 0.066, + "step": 2960 + }, + { + "epoch": 0.08491779842744818, + "grad_norm": 0.9062561988830566, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0472, + "step": 2970 + }, + { + "epoch": 0.08520371694067191, + "grad_norm": 1.0018329620361328, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0584, + "step": 2980 + }, + { + "epoch": 0.08548963545389564, + "grad_norm": 1.0577157735824585, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.05, + "step": 2990 + }, + { + "epoch": 0.08577555396711938, + "grad_norm": 1.0216799974441528, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0703, + "step": 3000 + }, + { + "epoch": 0.0860614724803431, + "grad_norm": 0.5581191778182983, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0682, + "step": 3010 + }, + { + "epoch": 0.08634739099356684, + "grad_norm": 0.6187682151794434, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 0.08663330950679056, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0565, + "step": 3030 + }, + { + "epoch": 0.0869192280200143, + "grad_norm": 0.8952509760856628, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0615, + "step": 3040 + }, + { + "epoch": 0.08720514653323802, + "grad_norm": 0.7387855648994446, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0434, + "step": 3050 + }, + { + "epoch": 0.08749106504646176, + "grad_norm": 0.8661363124847412, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0453, + "step": 3060 + }, + { + "epoch": 0.0877769835596855, + "grad_norm": 1.552089810371399, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0577, + "step": 3070 + }, + { + "epoch": 0.08806290207290922, + "grad_norm": 0.7555598616600037, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.058, + "step": 3080 + }, + { + "epoch": 0.08834882058613296, + "grad_norm": 0.7763100266456604, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.074, + "step": 3090 + }, + { + "epoch": 0.08863473909935668, + "grad_norm": 0.5088932514190674, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.07, + "step": 3100 + }, + { + "epoch": 0.08892065761258042, + "grad_norm": 0.517383873462677, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.08920657612580414, + "grad_norm": 0.9673930406570435, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.061, + "step": 3120 + }, + { + "epoch": 0.08949249463902788, + "grad_norm": 1.1182832717895508, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0503, + "step": 3130 + }, + { + "epoch": 0.0897784131522516, + "grad_norm": 0.8064592480659485, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0527, + "step": 3140 + }, + { + "epoch": 0.09006433166547534, + "grad_norm": 1.3616310358047485, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0491, + "step": 3150 + }, + { + "epoch": 0.09035025017869908, + "grad_norm": 0.6205968856811523, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0492, + "step": 3160 + }, + { + "epoch": 0.0906361686919228, + "grad_norm": 0.9427729249000549, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.054, + "step": 3170 + }, + { + "epoch": 0.09092208720514654, + "grad_norm": 0.6940050721168518, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0622, + "step": 3180 + }, + { + "epoch": 0.09120800571837026, + "grad_norm": 0.7082361578941345, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0474, + "step": 3190 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.4606474041938782, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 0.09177984274481772, + "grad_norm": 0.46445760130882263, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0483, + "step": 3210 + }, + { + "epoch": 0.09206576125804146, + "grad_norm": 0.7431371212005615, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.046, + "step": 3220 + }, + { + "epoch": 0.09235167977126518, + "grad_norm": 0.8430010676383972, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0665, + "step": 3230 + }, + { + "epoch": 0.09263759828448892, + "grad_norm": 0.9888875484466553, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0676, + "step": 3240 + }, + { + "epoch": 0.09292351679771266, + "grad_norm": 0.792150616645813, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0636, + "step": 3250 + }, + { + "epoch": 0.09320943531093638, + "grad_norm": 0.859030544757843, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0634, + "step": 3260 + }, + { + "epoch": 0.09349535382416012, + "grad_norm": 0.7612795233726501, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0658, + "step": 3270 + }, + { + "epoch": 0.09378127233738384, + "grad_norm": 0.5470104217529297, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0514, + "step": 3280 + }, + { + "epoch": 0.09406719085060758, + "grad_norm": 0.6354894042015076, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0489, + "step": 3290 + }, + { + "epoch": 0.0943531093638313, + "grad_norm": 1.3852356672286987, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 0.09463902787705504, + "grad_norm": 0.5610274076461792, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.09492494639027876, + "grad_norm": 1.2192410230636597, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0525, + "step": 3320 + }, + { + "epoch": 0.0952108649035025, + "grad_norm": 1.06831955909729, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.051, + "step": 3330 + }, + { + "epoch": 0.09549678341672624, + "grad_norm": 0.32288479804992676, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0503, + "step": 3340 + }, + { + "epoch": 0.09578270192994996, + "grad_norm": 0.5871645212173462, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0564, + "step": 3350 + }, + { + "epoch": 0.0960686204431737, + "grad_norm": 0.6069591045379639, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0495, + "step": 3360 + }, + { + "epoch": 0.09635453895639742, + "grad_norm": 1.0015379190444946, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0624, + "step": 3370 + }, + { + "epoch": 0.09664045746962116, + "grad_norm": 0.7534980773925781, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0618, + "step": 3380 + }, + { + "epoch": 0.09692637598284488, + "grad_norm": 0.45888280868530273, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0445, + "step": 3390 + }, + { + "epoch": 0.09721229449606862, + "grad_norm": 0.7550806403160095, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0461, + "step": 3400 + }, + { + "epoch": 0.09749821300929234, + "grad_norm": 0.4738181531429291, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 0.09778413152251608, + "grad_norm": 0.6711190938949585, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0476, + "step": 3420 + }, + { + "epoch": 0.09807005003573982, + "grad_norm": 0.4751316010951996, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0507, + "step": 3430 + }, + { + "epoch": 0.09835596854896354, + "grad_norm": 0.83565753698349, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "epoch": 0.09864188706218728, + "grad_norm": 0.5360665321350098, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0627, + "step": 3450 + }, + { + "epoch": 0.098927805575411, + "grad_norm": 0.7463604211807251, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0496, + "step": 3460 + }, + { + "epoch": 0.09921372408863474, + "grad_norm": 0.7294344305992126, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0697, + "step": 3470 + }, + { + "epoch": 0.09949964260185847, + "grad_norm": 0.5676283836364746, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0541, + "step": 3480 + }, + { + "epoch": 0.0997855611150822, + "grad_norm": 0.5879732370376587, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 0.10007147962830593, + "grad_norm": 0.832818865776062, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0505, + "step": 3500 + }, + { + "epoch": 0.10035739814152966, + "grad_norm": 0.48553410172462463, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0573, + "step": 3510 + }, + { + "epoch": 0.1006433166547534, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0449, + "step": 3520 + }, + { + "epoch": 0.10092923516797712, + "grad_norm": 0.7497885227203369, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0737, + "step": 3530 + }, + { + "epoch": 0.10121515368120086, + "grad_norm": 0.5581928491592407, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0514, + "step": 3540 + }, + { + "epoch": 0.10150107219442459, + "grad_norm": 1.140236258506775, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0524, + "step": 3550 + }, + { + "epoch": 0.10178699070764832, + "grad_norm": 0.8161870241165161, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0721, + "step": 3560 + }, + { + "epoch": 0.10207290922087205, + "grad_norm": 0.8796533942222595, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0564, + "step": 3570 + }, + { + "epoch": 0.10235882773409578, + "grad_norm": 1.4811128377914429, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.10264474624731951, + "grad_norm": 0.8029062747955322, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0589, + "step": 3590 + }, + { + "epoch": 0.10293066476054324, + "grad_norm": 0.7806634902954102, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0617, + "step": 3600 + }, + { + "epoch": 0.10321658327376698, + "grad_norm": 1.1286838054656982, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0574, + "step": 3610 + }, + { + "epoch": 0.1035025017869907, + "grad_norm": 0.374104768037796, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.047, + "step": 3620 + }, + { + "epoch": 0.10378842030021444, + "grad_norm": 1.1743136644363403, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0676, + "step": 3630 + }, + { + "epoch": 0.10407433881343817, + "grad_norm": 0.7684413194656372, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0663, + "step": 3640 + }, + { + "epoch": 0.1043602573266619, + "grad_norm": 1.0642409324645996, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.051, + "step": 3650 + }, + { + "epoch": 0.10464617583988563, + "grad_norm": 0.7752460837364197, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0543, + "step": 3660 + }, + { + "epoch": 0.10493209435310936, + "grad_norm": 0.9053257703781128, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.051, + "step": 3670 + }, + { + "epoch": 0.10521801286633309, + "grad_norm": 0.7407983541488647, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 0.10550393137955683, + "grad_norm": 1.3622519969940186, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0468, + "step": 3690 + }, + { + "epoch": 0.10578984989278056, + "grad_norm": 1.2751463651657104, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0571, + "step": 3700 + }, + { + "epoch": 0.10607576840600429, + "grad_norm": 0.5535411238670349, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0564, + "step": 3710 + }, + { + "epoch": 0.10636168691922802, + "grad_norm": 0.6728671193122864, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0487, + "step": 3720 + }, + { + "epoch": 0.10664760543245175, + "grad_norm": 0.82345050573349, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0656, + "step": 3730 + }, + { + "epoch": 0.10693352394567548, + "grad_norm": 0.6446594595909119, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0467, + "step": 3740 + }, + { + "epoch": 0.10721944245889921, + "grad_norm": 1.0836280584335327, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0536, + "step": 3750 + }, + { + "epoch": 0.10750536097212295, + "grad_norm": 0.3758300840854645, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0476, + "step": 3760 + }, + { + "epoch": 0.10779127948534667, + "grad_norm": 0.682266116142273, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0519, + "step": 3770 + }, + { + "epoch": 0.1080771979985704, + "grad_norm": 0.5025804042816162, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0474, + "step": 3780 + }, + { + "epoch": 0.10836311651179414, + "grad_norm": 1.019890308380127, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0492, + "step": 3790 + }, + { + "epoch": 0.10864903502501787, + "grad_norm": 0.7843710780143738, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0561, + "step": 3800 + }, + { + "epoch": 0.1089349535382416, + "grad_norm": 0.5028522610664368, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0582, + "step": 3810 + }, + { + "epoch": 0.10922087205146533, + "grad_norm": 0.6400144696235657, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0502, + "step": 3820 + }, + { + "epoch": 0.10950679056468907, + "grad_norm": 0.9432899355888367, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0471, + "step": 3830 + }, + { + "epoch": 0.10979270907791279, + "grad_norm": 0.7582482695579529, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.052, + "step": 3840 + }, + { + "epoch": 0.11007862759113653, + "grad_norm": 0.34035608172416687, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0464, + "step": 3850 + }, + { + "epoch": 0.11036454610436025, + "grad_norm": 1.3330878019332886, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.11065046461758399, + "grad_norm": 0.7309219837188721, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.052, + "step": 3870 + }, + { + "epoch": 0.11093638313080773, + "grad_norm": 0.6248922944068909, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0815, + "step": 3880 + }, + { + "epoch": 0.11122230164403145, + "grad_norm": 0.8298835158348083, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0491, + "step": 3890 + }, + { + "epoch": 0.11150822015725519, + "grad_norm": 0.6728928685188293, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0506, + "step": 3900 + }, + { + "epoch": 0.11179413867047891, + "grad_norm": 0.8456764817237854, + "learning_rate": 1.95567930185928e-05, + "loss": 0.051, + "step": 3910 + }, + { + "epoch": 0.11208005718370265, + "grad_norm": 0.9024212956428528, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0514, + "step": 3920 + }, + { + "epoch": 0.11236597569692637, + "grad_norm": 0.4843275845050812, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.056, + "step": 3930 + }, + { + "epoch": 0.11265189421015011, + "grad_norm": 0.5677530765533447, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0548, + "step": 3940 + }, + { + "epoch": 0.11293781272337383, + "grad_norm": 1.0913296937942505, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0697, + "step": 3950 + }, + { + "epoch": 0.11322373123659757, + "grad_norm": 0.6271129250526428, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0655, + "step": 3960 + }, + { + "epoch": 0.1135096497498213, + "grad_norm": 0.9063813090324402, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0469, + "step": 3970 + }, + { + "epoch": 0.11379556826304503, + "grad_norm": 0.7493836283683777, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0482, + "step": 3980 + }, + { + "epoch": 0.11408148677626877, + "grad_norm": 0.8022870421409607, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0655, + "step": 3990 + }, + { + "epoch": 0.11436740528949249, + "grad_norm": 0.6266750693321228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0542, + "step": 4000 + }, + { + "epoch": 0.11465332380271623, + "grad_norm": 0.45027732849121094, + "learning_rate": 1.95260726824789e-05, + "loss": 0.058, + "step": 4010 + }, + { + "epoch": 0.11493924231593995, + "grad_norm": 0.950760543346405, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0552, + "step": 4020 + }, + { + "epoch": 0.11522516082916369, + "grad_norm": 0.6397078037261963, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0544, + "step": 4030 + }, + { + "epoch": 0.11551107934238741, + "grad_norm": 0.7060579657554626, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0565, + "step": 4040 + }, + { + "epoch": 0.11579699785561115, + "grad_norm": 0.7861781716346741, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0511, + "step": 4050 + }, + { + "epoch": 0.11608291636883489, + "grad_norm": 0.5479229688644409, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0509, + "step": 4060 + }, + { + "epoch": 0.11636883488205861, + "grad_norm": 0.3854960501194, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0623, + "step": 4070 + }, + { + "epoch": 0.11665475339528235, + "grad_norm": 1.9533435106277466, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0643, + "step": 4080 + }, + { + "epoch": 0.11694067190850607, + "grad_norm": 0.5853668451309204, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0453, + "step": 4090 + }, + { + "epoch": 0.11722659042172981, + "grad_norm": 0.6850668787956238, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0555, + "step": 4100 + }, + { + "epoch": 0.11751250893495353, + "grad_norm": 1.1605839729309082, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0586, + "step": 4110 + }, + { + "epoch": 0.11779842744817727, + "grad_norm": 0.7753151059150696, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0623, + "step": 4120 + }, + { + "epoch": 0.118084345961401, + "grad_norm": 0.7955726385116577, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0467, + "step": 4130 + }, + { + "epoch": 0.11837026447462473, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0523, + "step": 4140 + }, + { + "epoch": 0.11865618298784847, + "grad_norm": 0.5821241140365601, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 0.11894210150107219, + "grad_norm": 0.4795539379119873, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0559, + "step": 4160 + }, + { + "epoch": 0.11922802001429593, + "grad_norm": 0.6324377655982971, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0473, + "step": 4170 + }, + { + "epoch": 0.11951393852751965, + "grad_norm": 0.8578745722770691, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0474, + "step": 4180 + }, + { + "epoch": 0.11979985704074339, + "grad_norm": 0.5988736748695374, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 0.12008577555396711, + "grad_norm": 0.8098701238632202, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0511, + "step": 4200 + }, + { + "epoch": 0.12037169406719085, + "grad_norm": 1.2059956789016724, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0501, + "step": 4210 + }, + { + "epoch": 0.12065761258041457, + "grad_norm": 0.7477571368217468, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0565, + "step": 4220 + }, + { + "epoch": 0.12094353109363831, + "grad_norm": 0.467942476272583, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0601, + "step": 4230 + }, + { + "epoch": 0.12122944960686205, + "grad_norm": 0.5761682391166687, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.052, + "step": 4240 + }, + { + "epoch": 0.12151536812008577, + "grad_norm": 0.8247032761573792, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0503, + "step": 4250 + }, + { + "epoch": 0.12180128663330951, + "grad_norm": 0.5218040347099304, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0434, + "step": 4260 + }, + { + "epoch": 0.12208720514653323, + "grad_norm": 0.5024936199188232, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 0.12237312365975697, + "grad_norm": 0.5558021664619446, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0493, + "step": 4280 + }, + { + "epoch": 0.1226590421729807, + "grad_norm": 0.6252139210700989, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0467, + "step": 4290 + }, + { + "epoch": 0.12294496068620443, + "grad_norm": 0.6613588929176331, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0572, + "step": 4300 + }, + { + "epoch": 0.12323087919942816, + "grad_norm": 0.8098927736282349, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0551, + "step": 4310 + }, + { + "epoch": 0.1235167977126519, + "grad_norm": 0.8598331809043884, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0517, + "step": 4320 + }, + { + "epoch": 0.12380271622587563, + "grad_norm": 1.2555822134017944, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0499, + "step": 4330 + }, + { + "epoch": 0.12408863473909935, + "grad_norm": 0.5311633348464966, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0467, + "step": 4340 + }, + { + "epoch": 0.12437455325232309, + "grad_norm": 0.5674521327018738, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0564, + "step": 4350 + }, + { + "epoch": 0.12466047176554682, + "grad_norm": 0.5226582884788513, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0503, + "step": 4360 + }, + { + "epoch": 0.12494639027877055, + "grad_norm": 0.8510275483131409, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0626, + "step": 4370 + }, + { + "epoch": 0.1252323087919943, + "grad_norm": 1.6184005737304688, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0565, + "step": 4380 + }, + { + "epoch": 0.125518227305218, + "grad_norm": 0.7836401462554932, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0567, + "step": 4390 + }, + { + "epoch": 0.12580414581844174, + "grad_norm": 0.686989963054657, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0727, + "step": 4400 + }, + { + "epoch": 0.12609006433166547, + "grad_norm": 0.6000984907150269, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0459, + "step": 4410 + }, + { + "epoch": 0.1263759828448892, + "grad_norm": 0.8751336932182312, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0509, + "step": 4420 + }, + { + "epoch": 0.12666190135811295, + "grad_norm": 0.9281551837921143, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0536, + "step": 4430 + }, + { + "epoch": 0.12694781987133666, + "grad_norm": 0.5268979668617249, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 0.1272337383845604, + "grad_norm": 0.9246962070465088, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0525, + "step": 4450 + }, + { + "epoch": 0.12751965689778413, + "grad_norm": 1.2159569263458252, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0559, + "step": 4460 + }, + { + "epoch": 0.12780557541100787, + "grad_norm": 1.1705470085144043, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0473, + "step": 4470 + }, + { + "epoch": 0.12809149392423158, + "grad_norm": 0.4624033570289612, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0385, + "step": 4480 + }, + { + "epoch": 0.12837741243745532, + "grad_norm": 0.68497633934021, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.055, + "step": 4490 + }, + { + "epoch": 0.12866333095067906, + "grad_norm": 0.6132450699806213, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0512, + "step": 4500 + }, + { + "epoch": 0.1289492494639028, + "grad_norm": 0.7438398003578186, + "learning_rate": 1.935753861926916e-05, + "loss": 0.057, + "step": 4510 + }, + { + "epoch": 0.12923516797712653, + "grad_norm": 1.01064133644104, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0542, + "step": 4520 + }, + { + "epoch": 0.12952108649035024, + "grad_norm": 0.7620115280151367, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0511, + "step": 4530 + }, + { + "epoch": 0.12980700500357398, + "grad_norm": 0.8325042128562927, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0434, + "step": 4540 + }, + { + "epoch": 0.13009292351679771, + "grad_norm": 1.333525538444519, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0527, + "step": 4550 + }, + { + "epoch": 0.13037884203002145, + "grad_norm": 0.5498093962669373, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0455, + "step": 4560 + }, + { + "epoch": 0.13066476054324516, + "grad_norm": 0.8072503209114075, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0471, + "step": 4570 + }, + { + "epoch": 0.1309506790564689, + "grad_norm": 0.7596970200538635, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0476, + "step": 4580 + }, + { + "epoch": 0.13123659756969264, + "grad_norm": 0.5895066857337952, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.058, + "step": 4590 + }, + { + "epoch": 0.13152251608291637, + "grad_norm": 0.7977209687232971, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0453, + "step": 4600 + }, + { + "epoch": 0.1318084345961401, + "grad_norm": 0.6070771813392639, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0626, + "step": 4610 + }, + { + "epoch": 0.13209435310936382, + "grad_norm": 0.776318371295929, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0502, + "step": 4620 + }, + { + "epoch": 0.13238027162258756, + "grad_norm": 0.7913787961006165, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0495, + "step": 4630 + }, + { + "epoch": 0.1326661901358113, + "grad_norm": 0.7327920794487, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0537, + "step": 4640 + }, + { + "epoch": 0.13295210864903503, + "grad_norm": 1.2004122734069824, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0479, + "step": 4650 + }, + { + "epoch": 0.13323802716225874, + "grad_norm": 0.663301408290863, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0426, + "step": 4660 + }, + { + "epoch": 0.13352394567548248, + "grad_norm": 0.7744486331939697, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0538, + "step": 4670 + }, + { + "epoch": 0.13380986418870622, + "grad_norm": 0.6179795265197754, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.13409578270192996, + "grad_norm": 0.6461634635925293, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0458, + "step": 4690 + }, + { + "epoch": 0.1343817012151537, + "grad_norm": 0.6578474640846252, + "learning_rate": 1.928703895604588e-05, + "loss": 0.064, + "step": 4700 + }, + { + "epoch": 0.1346676197283774, + "grad_norm": 0.8851020336151123, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0632, + "step": 4710 + }, + { + "epoch": 0.13495353824160114, + "grad_norm": 0.4704781472682953, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0431, + "step": 4720 + }, + { + "epoch": 0.13523945675482488, + "grad_norm": 0.9809741377830505, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.059, + "step": 4730 + }, + { + "epoch": 0.13552537526804861, + "grad_norm": 0.9307458400726318, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0528, + "step": 4740 + }, + { + "epoch": 0.13581129378127232, + "grad_norm": 0.8084405660629272, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0603, + "step": 4750 + }, + { + "epoch": 0.13609721229449606, + "grad_norm": 0.6919799447059631, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0589, + "step": 4760 + }, + { + "epoch": 0.1363831308077198, + "grad_norm": 0.8543849587440491, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0508, + "step": 4770 + }, + { + "epoch": 0.13666904932094354, + "grad_norm": 0.6308473348617554, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0485, + "step": 4780 + }, + { + "epoch": 0.13695496783416727, + "grad_norm": 0.739931046962738, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0534, + "step": 4790 + }, + { + "epoch": 0.13724088634739098, + "grad_norm": 0.7895604372024536, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0449, + "step": 4800 + }, + { + "epoch": 0.13752680486061472, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0496, + "step": 4810 + }, + { + "epoch": 0.13781272337383846, + "grad_norm": 0.5999978184700012, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.1380986418870622, + "grad_norm": 0.8037213087081909, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0667, + "step": 4830 + }, + { + "epoch": 0.1383845604002859, + "grad_norm": 0.7414689064025879, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0509, + "step": 4840 + }, + { + "epoch": 0.13867047891350964, + "grad_norm": 0.6627739667892456, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0515, + "step": 4850 + }, + { + "epoch": 0.13895639742673338, + "grad_norm": 0.6969587802886963, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0626, + "step": 4860 + }, + { + "epoch": 0.13924231593995712, + "grad_norm": 0.7554855942726135, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0478, + "step": 4870 + }, + { + "epoch": 0.13952823445318085, + "grad_norm": 0.5623564124107361, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.044, + "step": 4880 + }, + { + "epoch": 0.13981415296640456, + "grad_norm": 0.6897832751274109, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0466, + "step": 4890 + }, + { + "epoch": 0.1401000714796283, + "grad_norm": 0.5474520921707153, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0514, + "step": 4900 + }, + { + "epoch": 0.14038598999285204, + "grad_norm": 0.9736361503601074, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0472, + "step": 4910 + }, + { + "epoch": 0.14067190850607578, + "grad_norm": 0.5566041469573975, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0533, + "step": 4920 + }, + { + "epoch": 0.1409578270192995, + "grad_norm": 1.0295166969299316, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0478, + "step": 4930 + }, + { + "epoch": 0.14124374553252322, + "grad_norm": 1.0931389331817627, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0652, + "step": 4940 + }, + { + "epoch": 0.14152966404574696, + "grad_norm": 1.3054399490356445, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0564, + "step": 4950 + }, + { + "epoch": 0.1418155825589707, + "grad_norm": 0.45592883229255676, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0456, + "step": 4960 + }, + { + "epoch": 0.14210150107219444, + "grad_norm": 0.6758268475532532, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0543, + "step": 4970 + }, + { + "epoch": 0.14238741958541815, + "grad_norm": 0.9643615484237671, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 0.14267333809864188, + "grad_norm": 0.565969705581665, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0458, + "step": 4990 + }, + { + "epoch": 0.14295925661186562, + "grad_norm": 0.8053064346313477, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0558, + "step": 5000 + }, + { + "epoch": 0.14324517512508936, + "grad_norm": 0.606215238571167, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0562, + "step": 5010 + }, + { + "epoch": 0.14353109363831307, + "grad_norm": 0.5565656423568726, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0543, + "step": 5020 + }, + { + "epoch": 0.1438170121515368, + "grad_norm": 0.353696346282959, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0451, + "step": 5030 + }, + { + "epoch": 0.14410293066476054, + "grad_norm": 0.6627641916275024, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0607, + "step": 5040 + }, + { + "epoch": 0.14438884917798428, + "grad_norm": 0.7896742224693298, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0496, + "step": 5050 + }, + { + "epoch": 0.14467476769120802, + "grad_norm": 0.7444631457328796, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0641, + "step": 5060 + }, + { + "epoch": 0.14496068620443173, + "grad_norm": 0.7871376872062683, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 0.14524660471765546, + "grad_norm": 0.7784642577171326, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0466, + "step": 5080 + }, + { + "epoch": 0.1455325232308792, + "grad_norm": 0.6950685381889343, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0457, + "step": 5090 + }, + { + "epoch": 0.14581844174410294, + "grad_norm": 1.0631619691848755, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0513, + "step": 5100 + }, + { + "epoch": 0.14610436025732665, + "grad_norm": 0.4327051639556885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0599, + "step": 5110 + }, + { + "epoch": 0.14639027877055039, + "grad_norm": 0.7790032029151917, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0617, + "step": 5120 + }, + { + "epoch": 0.14667619728377412, + "grad_norm": 0.42061591148376465, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.14696211579699786, + "grad_norm": 1.4090712070465088, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0564, + "step": 5140 + }, + { + "epoch": 0.1472480343102216, + "grad_norm": 0.540844738483429, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0505, + "step": 5150 + }, + { + "epoch": 0.1475339528234453, + "grad_norm": 0.5608566999435425, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0583, + "step": 5160 + }, + { + "epoch": 0.14781987133666905, + "grad_norm": 0.750708818435669, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0467, + "step": 5170 + }, + { + "epoch": 0.14810578984989278, + "grad_norm": 0.608989953994751, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0507, + "step": 5180 + }, + { + "epoch": 0.14839170836311652, + "grad_norm": 0.8176707029342651, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0455, + "step": 5190 + }, + { + "epoch": 0.14867762687634023, + "grad_norm": 0.5280511379241943, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0586, + "step": 5200 + }, + { + "epoch": 0.14896354538956397, + "grad_norm": 0.5914652347564697, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.042, + "step": 5210 + }, + { + "epoch": 0.1492494639027877, + "grad_norm": 0.4816238582134247, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0431, + "step": 5220 + }, + { + "epoch": 0.14953538241601144, + "grad_norm": 0.5413132309913635, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0453, + "step": 5230 + }, + { + "epoch": 0.14982130092923518, + "grad_norm": 0.749200701713562, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0505, + "step": 5240 + }, + { + "epoch": 0.1501072194424589, + "grad_norm": 0.8051598072052002, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.15039313795568263, + "grad_norm": 0.5365609526634216, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0383, + "step": 5260 + }, + { + "epoch": 0.15067905646890636, + "grad_norm": 0.5546812415122986, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0438, + "step": 5270 + }, + { + "epoch": 0.1509649749821301, + "grad_norm": 0.6248345375061035, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.045, + "step": 5280 + }, + { + "epoch": 0.1512508934953538, + "grad_norm": 0.42673179507255554, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0435, + "step": 5290 + }, + { + "epoch": 0.15153681200857755, + "grad_norm": 0.6677115559577942, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 0.15182273052180129, + "grad_norm": 0.4739227294921875, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0516, + "step": 5310 + }, + { + "epoch": 0.15210864903502502, + "grad_norm": 0.7931821346282959, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0566, + "step": 5320 + }, + { + "epoch": 0.15239456754824876, + "grad_norm": 0.6296460032463074, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0496, + "step": 5330 + }, + { + "epoch": 0.15268048606147247, + "grad_norm": 0.6713911890983582, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0462, + "step": 5340 + }, + { + "epoch": 0.1529664045746962, + "grad_norm": 1.088040828704834, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0663, + "step": 5350 + }, + { + "epoch": 0.15325232308791994, + "grad_norm": 1.4942265748977661, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0541, + "step": 5360 + }, + { + "epoch": 0.15353824160114368, + "grad_norm": 1.5721286535263062, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0546, + "step": 5370 + }, + { + "epoch": 0.1538241601143674, + "grad_norm": 0.9329798221588135, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0538, + "step": 5380 + }, + { + "epoch": 0.15411007862759113, + "grad_norm": 0.5658103823661804, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0561, + "step": 5390 + }, + { + "epoch": 0.15439599714081487, + "grad_norm": 0.6210218071937561, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.054, + "step": 5400 + }, + { + "epoch": 0.1546819156540386, + "grad_norm": 0.7934702634811401, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0506, + "step": 5410 + }, + { + "epoch": 0.15496783416726234, + "grad_norm": 1.0321810245513916, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0483, + "step": 5420 + }, + { + "epoch": 0.15525375268048605, + "grad_norm": 0.6226248145103455, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0464, + "step": 5430 + }, + { + "epoch": 0.1555396711937098, + "grad_norm": 0.6217877864837646, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0466, + "step": 5440 + }, + { + "epoch": 0.15582558970693353, + "grad_norm": 0.44068101048469543, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0517, + "step": 5450 + }, + { + "epoch": 0.15611150822015726, + "grad_norm": 0.4715922772884369, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0391, + "step": 5460 + }, + { + "epoch": 0.15639742673338097, + "grad_norm": 0.6649858951568604, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0524, + "step": 5470 + }, + { + "epoch": 0.1566833452466047, + "grad_norm": 0.5635918974876404, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.054, + "step": 5480 + }, + { + "epoch": 0.15696926375982845, + "grad_norm": 0.5584990978240967, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0559, + "step": 5490 + }, + { + "epoch": 0.15725518227305219, + "grad_norm": 0.7777124047279358, + "learning_rate": 1.895206504082557e-05, + "loss": 0.052, + "step": 5500 + }, + { + "epoch": 0.15754110078627592, + "grad_norm": 0.7057285308837891, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0507, + "step": 5510 + }, + { + "epoch": 0.15782701929949963, + "grad_norm": 0.4290146827697754, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0508, + "step": 5520 + }, + { + "epoch": 0.15811293781272337, + "grad_norm": 0.7333746552467346, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0609, + "step": 5530 + }, + { + "epoch": 0.1583988563259471, + "grad_norm": 0.6905514001846313, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0441, + "step": 5540 + }, + { + "epoch": 0.15868477483917084, + "grad_norm": 0.4859441816806793, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0586, + "step": 5550 + }, + { + "epoch": 0.15897069335239455, + "grad_norm": 0.4259501099586487, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0446, + "step": 5560 + }, + { + "epoch": 0.1592566118656183, + "grad_norm": 0.7659216523170471, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0486, + "step": 5570 + }, + { + "epoch": 0.15954253037884203, + "grad_norm": 0.6377918124198914, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0497, + "step": 5580 + }, + { + "epoch": 0.15982844889206577, + "grad_norm": 0.9122095704078674, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0497, + "step": 5590 + }, + { + "epoch": 0.1601143674052895, + "grad_norm": 0.5986319780349731, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0789, + "step": 5600 + }, + { + "epoch": 0.1604002859185132, + "grad_norm": 0.6486982107162476, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0481, + "step": 5610 + }, + { + "epoch": 0.16068620443173695, + "grad_norm": 0.9778286814689636, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0609, + "step": 5620 + }, + { + "epoch": 0.1609721229449607, + "grad_norm": 0.9133608341217041, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0499, + "step": 5630 + }, + { + "epoch": 0.16125804145818443, + "grad_norm": 0.8979085087776184, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0539, + "step": 5640 + }, + { + "epoch": 0.16154395997140814, + "grad_norm": 0.7787102460861206, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0593, + "step": 5650 + }, + { + "epoch": 0.16182987848463187, + "grad_norm": 0.8269296884536743, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0466, + "step": 5660 + }, + { + "epoch": 0.1621157969978556, + "grad_norm": 1.0018537044525146, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0542, + "step": 5670 + }, + { + "epoch": 0.16240171551107935, + "grad_norm": 0.6690066456794739, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0504, + "step": 5680 + }, + { + "epoch": 0.16268763402430308, + "grad_norm": 0.8186119198799133, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0523, + "step": 5690 + }, + { + "epoch": 0.1629735525375268, + "grad_norm": 0.6039218902587891, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.053, + "step": 5700 + }, + { + "epoch": 0.16325947105075053, + "grad_norm": 0.5570294857025146, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0627, + "step": 5710 + }, + { + "epoch": 0.16354538956397427, + "grad_norm": 0.6330029368400574, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.043, + "step": 5720 + }, + { + "epoch": 0.163831308077198, + "grad_norm": 0.42857953906059265, + "learning_rate": 1.884459101447439e-05, + "loss": 0.043, + "step": 5730 + }, + { + "epoch": 0.16411722659042172, + "grad_norm": 0.6611765027046204, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0478, + "step": 5740 + }, + { + "epoch": 0.16440314510364545, + "grad_norm": 0.5025321841239929, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0563, + "step": 5750 + }, + { + "epoch": 0.1646890636168692, + "grad_norm": 0.468772292137146, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0579, + "step": 5760 + }, + { + "epoch": 0.16497498213009293, + "grad_norm": 0.8914149403572083, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0451, + "step": 5770 + }, + { + "epoch": 0.16526090064331667, + "grad_norm": 0.7421362996101379, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0446, + "step": 5780 + }, + { + "epoch": 0.16554681915654038, + "grad_norm": 0.6159907579421997, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0486, + "step": 5790 + }, + { + "epoch": 0.1658327376697641, + "grad_norm": 0.7762402892112732, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0528, + "step": 5800 + }, + { + "epoch": 0.16611865618298785, + "grad_norm": 0.688562273979187, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0562, + "step": 5810 + }, + { + "epoch": 0.1664045746962116, + "grad_norm": 0.6233720183372498, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0454, + "step": 5820 + }, + { + "epoch": 0.1666904932094353, + "grad_norm": 1.0762931108474731, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0586, + "step": 5830 + }, + { + "epoch": 0.16697641172265903, + "grad_norm": 0.6782101988792419, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0486, + "step": 5840 + }, + { + "epoch": 0.16726233023588277, + "grad_norm": 0.8854986429214478, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0668, + "step": 5850 + }, + { + "epoch": 0.1675482487491065, + "grad_norm": 0.6537308096885681, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0456, + "step": 5860 + }, + { + "epoch": 0.16783416726233025, + "grad_norm": 1.4588080644607544, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0531, + "step": 5870 + }, + { + "epoch": 0.16812008577555396, + "grad_norm": 0.4888838529586792, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0608, + "step": 5880 + }, + { + "epoch": 0.1684060042887777, + "grad_norm": 0.6046859622001648, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0596, + "step": 5890 + }, + { + "epoch": 0.16869192280200143, + "grad_norm": 1.0373053550720215, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0672, + "step": 5900 + }, + { + "epoch": 0.16897784131522517, + "grad_norm": 0.7728743553161621, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0567, + "step": 5910 + }, + { + "epoch": 0.16926375982844888, + "grad_norm": 0.7804396152496338, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0443, + "step": 5920 + }, + { + "epoch": 0.16954967834167262, + "grad_norm": 0.5331568717956543, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0462, + "step": 5930 + }, + { + "epoch": 0.16983559685489635, + "grad_norm": 0.5623118877410889, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0549, + "step": 5940 + }, + { + "epoch": 0.1701215153681201, + "grad_norm": 0.5113009214401245, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.17040743388134383, + "grad_norm": 0.45996031165122986, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0474, + "step": 5960 + }, + { + "epoch": 0.17069335239456754, + "grad_norm": 0.9673702716827393, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0496, + "step": 5970 + }, + { + "epoch": 0.17097927090779128, + "grad_norm": 0.6134442687034607, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0472, + "step": 5980 + }, + { + "epoch": 0.171265189421015, + "grad_norm": 0.5929660797119141, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0448, + "step": 5990 + }, + { + "epoch": 0.17155110793423875, + "grad_norm": 0.6973591446876526, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0561, + "step": 6000 + }, + { + "epoch": 0.17183702644746246, + "grad_norm": 0.6361686587333679, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0528, + "step": 6010 + }, + { + "epoch": 0.1721229449606862, + "grad_norm": 0.8463344573974609, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0505, + "step": 6020 + }, + { + "epoch": 0.17240886347390993, + "grad_norm": 0.7931243777275085, + "learning_rate": 1.869709961183946e-05, + "loss": 0.047, + "step": 6030 + }, + { + "epoch": 0.17269478198713367, + "grad_norm": 0.8827673196792603, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0449, + "step": 6040 + }, + { + "epoch": 0.1729807005003574, + "grad_norm": 0.624167263507843, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0432, + "step": 6050 + }, + { + "epoch": 0.17326661901358112, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0546, + "step": 6060 + }, + { + "epoch": 0.17355253752680486, + "grad_norm": 0.6836652755737305, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0463, + "step": 6070 + }, + { + "epoch": 0.1738384560400286, + "grad_norm": 0.5454772114753723, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0554, + "step": 6080 + }, + { + "epoch": 0.17412437455325233, + "grad_norm": 0.3758164048194885, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0437, + "step": 6090 + }, + { + "epoch": 0.17441029306647604, + "grad_norm": 0.4269026517868042, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0436, + "step": 6100 + }, + { + "epoch": 0.17469621157969978, + "grad_norm": 1.3504232168197632, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0563, + "step": 6110 + }, + { + "epoch": 0.17498213009292352, + "grad_norm": 0.6270191669464111, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0552, + "step": 6120 + }, + { + "epoch": 0.17526804860614725, + "grad_norm": 0.7632624506950378, + "learning_rate": 1.864612143364565e-05, + "loss": 0.042, + "step": 6130 + }, + { + "epoch": 0.175553967119371, + "grad_norm": 0.7420883774757385, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0472, + "step": 6140 + }, + { + "epoch": 0.1758398856325947, + "grad_norm": 0.38518550992012024, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0494, + "step": 6150 + }, + { + "epoch": 0.17612580414581844, + "grad_norm": 0.4203122556209564, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.049, + "step": 6160 + }, + { + "epoch": 0.17641172265904217, + "grad_norm": 0.843169093132019, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0528, + "step": 6170 + }, + { + "epoch": 0.1766976411722659, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0441, + "step": 6180 + }, + { + "epoch": 0.17698355968548962, + "grad_norm": 0.9894040822982788, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0494, + "step": 6190 + }, + { + "epoch": 0.17726947819871336, + "grad_norm": 0.8269744515419006, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0533, + "step": 6200 + }, + { + "epoch": 0.1775553967119371, + "grad_norm": 0.7923200726509094, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0518, + "step": 6210 + }, + { + "epoch": 0.17784131522516083, + "grad_norm": 0.580436646938324, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 0.17812723373838457, + "grad_norm": 1.0633399486541748, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0528, + "step": 6230 + }, + { + "epoch": 0.17841315225160828, + "grad_norm": 0.925599217414856, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0552, + "step": 6240 + }, + { + "epoch": 0.17869907076483202, + "grad_norm": 0.5874597430229187, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0473, + "step": 6250 + }, + { + "epoch": 0.17898498927805576, + "grad_norm": 0.9065818190574646, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0504, + "step": 6260 + }, + { + "epoch": 0.1792709077912795, + "grad_norm": 0.9060930609703064, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0578, + "step": 6270 + }, + { + "epoch": 0.1795568263045032, + "grad_norm": 0.6221855878829956, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.17984274481772694, + "grad_norm": 0.589621901512146, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0452, + "step": 6290 + }, + { + "epoch": 0.18012866333095068, + "grad_norm": 0.4308580756187439, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0528, + "step": 6300 + }, + { + "epoch": 0.18041458184417442, + "grad_norm": 0.34031248092651367, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0544, + "step": 6310 + }, + { + "epoch": 0.18070050035739815, + "grad_norm": 0.6438931226730347, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0411, + "step": 6320 + }, + { + "epoch": 0.18098641887062186, + "grad_norm": 0.5436957478523254, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0381, + "step": 6330 + }, + { + "epoch": 0.1812723373838456, + "grad_norm": 0.7326043248176575, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0486, + "step": 6340 + }, + { + "epoch": 0.18155825589706934, + "grad_norm": 0.9194608330726624, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0455, + "step": 6350 + }, + { + "epoch": 0.18184417441029307, + "grad_norm": 0.9366886019706726, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0529, + "step": 6360 + }, + { + "epoch": 0.18213009292351678, + "grad_norm": 0.3178311586380005, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0455, + "step": 6370 + }, + { + "epoch": 0.18241601143674052, + "grad_norm": 0.9811000823974609, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.054, + "step": 6380 + }, + { + "epoch": 0.18270192994996426, + "grad_norm": 0.4635869562625885, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0466, + "step": 6390 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.6958444118499756, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0448, + "step": 6400 + }, + { + "epoch": 0.18327376697641173, + "grad_norm": 0.765814483165741, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0558, + "step": 6410 + }, + { + "epoch": 0.18355968548963544, + "grad_norm": 0.4117525815963745, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0484, + "step": 6420 + }, + { + "epoch": 0.18384560400285918, + "grad_norm": 0.6114997267723083, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0449, + "step": 6430 + }, + { + "epoch": 0.18413152251608292, + "grad_norm": 0.6006572842597961, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0442, + "step": 6440 + }, + { + "epoch": 0.18441744102930666, + "grad_norm": 0.5918669104576111, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0472, + "step": 6450 + }, + { + "epoch": 0.18470335954253037, + "grad_norm": 0.42107391357421875, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0471, + "step": 6460 + }, + { + "epoch": 0.1849892780557541, + "grad_norm": 0.5666350722312927, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0451, + "step": 6470 + }, + { + "epoch": 0.18527519656897784, + "grad_norm": 0.6074198484420776, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.051, + "step": 6480 + }, + { + "epoch": 0.18556111508220158, + "grad_norm": 0.771105945110321, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0402, + "step": 6490 + }, + { + "epoch": 0.18584703359542531, + "grad_norm": 0.6381934881210327, + "learning_rate": 1.844974808419918e-05, + "loss": 0.049, + "step": 6500 + }, + { + "epoch": 0.18613295210864902, + "grad_norm": 0.4039069712162018, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0477, + "step": 6510 + }, + { + "epoch": 0.18641887062187276, + "grad_norm": 0.8936404585838318, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0515, + "step": 6520 + }, + { + "epoch": 0.1867047891350965, + "grad_norm": 0.5358276963233948, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0397, + "step": 6530 + }, + { + "epoch": 0.18699070764832024, + "grad_norm": 0.7260947823524475, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0501, + "step": 6540 + }, + { + "epoch": 0.18727662616154395, + "grad_norm": 0.6378960609436035, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0575, + "step": 6550 + }, + { + "epoch": 0.18756254467476768, + "grad_norm": 0.5879429578781128, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.041, + "step": 6560 + }, + { + "epoch": 0.18784846318799142, + "grad_norm": 0.846297025680542, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0494, + "step": 6570 + }, + { + "epoch": 0.18813438170121516, + "grad_norm": 0.5211764574050903, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0463, + "step": 6580 + }, + { + "epoch": 0.1884203002144389, + "grad_norm": 0.8060504794120789, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0486, + "step": 6590 + }, + { + "epoch": 0.1887062187276626, + "grad_norm": 0.5741685628890991, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0435, + "step": 6600 + }, + { + "epoch": 0.18899213724088634, + "grad_norm": 0.6195408701896667, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0609, + "step": 6610 + }, + { + "epoch": 0.18927805575411008, + "grad_norm": 0.46843090653419495, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0477, + "step": 6620 + }, + { + "epoch": 0.18956397426733382, + "grad_norm": 0.5169982314109802, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0515, + "step": 6630 + }, + { + "epoch": 0.18984989278055753, + "grad_norm": 0.5571608543395996, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0492, + "step": 6640 + }, + { + "epoch": 0.19013581129378126, + "grad_norm": 0.7798209190368652, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0682, + "step": 6650 + }, + { + "epoch": 0.190421729807005, + "grad_norm": 0.6120383143424988, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0516, + "step": 6660 + }, + { + "epoch": 0.19070764832022874, + "grad_norm": 1.0191924571990967, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.049, + "step": 6670 + }, + { + "epoch": 0.19099356683345248, + "grad_norm": 0.5271646976470947, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0461, + "step": 6680 + }, + { + "epoch": 0.1912794853466762, + "grad_norm": 0.3315111994743347, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 0.19156540385989992, + "grad_norm": 0.7598944306373596, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0576, + "step": 6700 + }, + { + "epoch": 0.19185132237312366, + "grad_norm": 0.8039186596870422, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0489, + "step": 6710 + }, + { + "epoch": 0.1921372408863474, + "grad_norm": 0.911704957485199, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0508, + "step": 6720 + }, + { + "epoch": 0.1924231593995711, + "grad_norm": 0.6092261672019958, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0494, + "step": 6730 + }, + { + "epoch": 0.19270907791279485, + "grad_norm": 0.7890674471855164, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.049, + "step": 6740 + }, + { + "epoch": 0.19299499642601858, + "grad_norm": 0.8601320385932922, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0402, + "step": 6750 + }, + { + "epoch": 0.19328091493924232, + "grad_norm": 0.8750951290130615, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0517, + "step": 6760 + }, + { + "epoch": 0.19356683345246606, + "grad_norm": 0.7143217921257019, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0414, + "step": 6770 + }, + { + "epoch": 0.19385275196568977, + "grad_norm": 0.8340809345245361, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.1941386704789135, + "grad_norm": 0.4074079692363739, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0455, + "step": 6790 + }, + { + "epoch": 0.19442458899213724, + "grad_norm": 0.5369135737419128, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0472, + "step": 6800 + }, + { + "epoch": 0.19471050750536098, + "grad_norm": 0.44467195868492126, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 0.1949964260185847, + "grad_norm": 0.6032440662384033, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0479, + "step": 6820 + }, + { + "epoch": 0.19528234453180843, + "grad_norm": 0.4078349173069, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 0.19556826304503216, + "grad_norm": 0.49480268359184265, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0432, + "step": 6840 + }, + { + "epoch": 0.1958541815582559, + "grad_norm": 0.9844514727592468, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0479, + "step": 6850 + }, + { + "epoch": 0.19614010007147964, + "grad_norm": 1.1353951692581177, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0539, + "step": 6860 + }, + { + "epoch": 0.19642601858470335, + "grad_norm": 0.7535272836685181, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0572, + "step": 6870 + }, + { + "epoch": 0.1967119370979271, + "grad_norm": 0.4950162470340729, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0466, + "step": 6880 + }, + { + "epoch": 0.19699785561115082, + "grad_norm": 0.5310598015785217, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0487, + "step": 6890 + }, + { + "epoch": 0.19728377412437456, + "grad_norm": 0.9481188654899597, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0448, + "step": 6900 + }, + { + "epoch": 0.19756969263759827, + "grad_norm": 0.5303207039833069, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0419, + "step": 6910 + }, + { + "epoch": 0.197855611150822, + "grad_norm": 0.6180852055549622, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 0.19814152966404575, + "grad_norm": 0.5310384631156921, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0471, + "step": 6930 + }, + { + "epoch": 0.19842744817726948, + "grad_norm": 0.546660304069519, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0481, + "step": 6940 + }, + { + "epoch": 0.19871336669049322, + "grad_norm": 0.7824214696884155, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0423, + "step": 6950 + }, + { + "epoch": 0.19899928520371693, + "grad_norm": 0.9130761623382568, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0436, + "step": 6960 + }, + { + "epoch": 0.19928520371694067, + "grad_norm": 1.0512481927871704, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 0.1995711222301644, + "grad_norm": 0.8660218715667725, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0533, + "step": 6980 + }, + { + "epoch": 0.19985704074338814, + "grad_norm": 0.5280078649520874, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0455, + "step": 6990 + }, + { + "epoch": 0.20014295925661185, + "grad_norm": 0.6151753067970276, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0476, + "step": 7000 + }, + { + "epoch": 0.2004288777698356, + "grad_norm": 0.7165628671646118, + "learning_rate": 1.815952390818299e-05, + "loss": 0.051, + "step": 7010 + }, + { + "epoch": 0.20071479628305933, + "grad_norm": 0.6857513189315796, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0566, + "step": 7020 + }, + { + "epoch": 0.20100071479628306, + "grad_norm": 0.5589154958724976, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0741, + "step": 7030 + }, + { + "epoch": 0.2012866333095068, + "grad_norm": 0.6684713959693909, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0461, + "step": 7040 + }, + { + "epoch": 0.2015725518227305, + "grad_norm": 0.41142046451568604, + "learning_rate": 1.813582526827608e-05, + "loss": 0.043, + "step": 7050 + }, + { + "epoch": 0.20185847033595425, + "grad_norm": 0.29734253883361816, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0464, + "step": 7060 + }, + { + "epoch": 0.20214438884917799, + "grad_norm": 0.3914707899093628, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.052, + "step": 7070 + }, + { + "epoch": 0.20243030736240172, + "grad_norm": 0.5075880885124207, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0401, + "step": 7080 + }, + { + "epoch": 0.20271622587562543, + "grad_norm": 0.6182138919830322, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0428, + "step": 7090 + }, + { + "epoch": 0.20300214438884917, + "grad_norm": 1.0438663959503174, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0476, + "step": 7100 + }, + { + "epoch": 0.2032880629020729, + "grad_norm": 0.4646940529346466, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0419, + "step": 7110 + }, + { + "epoch": 0.20357398141529665, + "grad_norm": 0.4236893951892853, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0539, + "step": 7120 + }, + { + "epoch": 0.20385989992852038, + "grad_norm": 0.7975651025772095, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0459, + "step": 7130 + }, + { + "epoch": 0.2041458184417441, + "grad_norm": 0.9628227949142456, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0568, + "step": 7140 + }, + { + "epoch": 0.20443173695496783, + "grad_norm": 0.8878718614578247, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0528, + "step": 7150 + }, + { + "epoch": 0.20471765546819157, + "grad_norm": 0.5407359004020691, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0417, + "step": 7160 + }, + { + "epoch": 0.2050035739814153, + "grad_norm": 0.4407803416252136, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0435, + "step": 7170 + }, + { + "epoch": 0.20528949249463901, + "grad_norm": 0.4055456221103668, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0652, + "step": 7180 + }, + { + "epoch": 0.20557541100786275, + "grad_norm": 0.44706887006759644, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0476, + "step": 7190 + }, + { + "epoch": 0.2058613295210865, + "grad_norm": 1.2640881538391113, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0496, + "step": 7200 + }, + { + "epoch": 0.20614724803431023, + "grad_norm": 0.3773214817047119, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0455, + "step": 7210 + }, + { + "epoch": 0.20643316654753396, + "grad_norm": 0.6460191011428833, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0765, + "step": 7220 + }, + { + "epoch": 0.20671908506075767, + "grad_norm": 0.6048172116279602, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0506, + "step": 7230 + }, + { + "epoch": 0.2070050035739814, + "grad_norm": 0.38502392172813416, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0487, + "step": 7240 + }, + { + "epoch": 0.20729092208720515, + "grad_norm": 1.5727262496948242, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.20757684060042889, + "grad_norm": 0.3985368609428406, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0501, + "step": 7260 + }, + { + "epoch": 0.2078627591136526, + "grad_norm": 0.4519219994544983, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0542, + "step": 7270 + }, + { + "epoch": 0.20814867762687633, + "grad_norm": 0.6547327637672424, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0628, + "step": 7280 + }, + { + "epoch": 0.20843459614010007, + "grad_norm": 0.7864896655082703, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0521, + "step": 7290 + }, + { + "epoch": 0.2087205146533238, + "grad_norm": 0.6605416536331177, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0501, + "step": 7300 + }, + { + "epoch": 0.20900643316654754, + "grad_norm": 0.8260928988456726, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 0.20929235167977125, + "grad_norm": 0.7167025804519653, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0465, + "step": 7320 + }, + { + "epoch": 0.209578270192995, + "grad_norm": 0.6838316917419434, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0449, + "step": 7330 + }, + { + "epoch": 0.20986418870621873, + "grad_norm": 0.46520882844924927, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0441, + "step": 7340 + }, + { + "epoch": 0.21015010721944247, + "grad_norm": 0.680860698223114, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0498, + "step": 7350 + }, + { + "epoch": 0.21043602573266618, + "grad_norm": 0.6697542071342468, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0361, + "step": 7360 + }, + { + "epoch": 0.21072194424588991, + "grad_norm": 0.9322425127029419, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0561, + "step": 7370 + }, + { + "epoch": 0.21100786275911365, + "grad_norm": 0.7454982399940491, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0464, + "step": 7380 + }, + { + "epoch": 0.2112937812723374, + "grad_norm": 0.5052962899208069, + "learning_rate": 1.792902262617481e-05, + "loss": 0.042, + "step": 7390 + }, + { + "epoch": 0.21157969978556113, + "grad_norm": 0.622719407081604, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0392, + "step": 7400 + }, + { + "epoch": 0.21186561829878484, + "grad_norm": 0.8296751976013184, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0512, + "step": 7410 + }, + { + "epoch": 0.21215153681200857, + "grad_norm": 0.7341750860214233, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0407, + "step": 7420 + }, + { + "epoch": 0.2124374553252323, + "grad_norm": 0.8206498026847839, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0491, + "step": 7430 + }, + { + "epoch": 0.21272337383845605, + "grad_norm": 0.5625871419906616, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0405, + "step": 7440 + }, + { + "epoch": 0.21300929235167976, + "grad_norm": 0.600284218788147, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0464, + "step": 7450 + }, + { + "epoch": 0.2132952108649035, + "grad_norm": 1.0839911699295044, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0481, + "step": 7460 + }, + { + "epoch": 0.21358112937812723, + "grad_norm": 0.45663371682167053, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0478, + "step": 7470 + }, + { + "epoch": 0.21386704789135097, + "grad_norm": 0.9196961522102356, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0566, + "step": 7480 + }, + { + "epoch": 0.2141529664045747, + "grad_norm": 0.5013288855552673, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0442, + "step": 7490 + }, + { + "epoch": 0.21443888491779842, + "grad_norm": 0.6444706916809082, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0484, + "step": 7500 + }, + { + "epoch": 0.21472480343102215, + "grad_norm": 0.5789361000061035, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0536, + "step": 7510 + }, + { + "epoch": 0.2150107219442459, + "grad_norm": 0.7474827170372009, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0526, + "step": 7520 + }, + { + "epoch": 0.21529664045746963, + "grad_norm": 0.7054215669631958, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0538, + "step": 7530 + }, + { + "epoch": 0.21558255897069334, + "grad_norm": 0.9778858423233032, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0533, + "step": 7540 + }, + { + "epoch": 0.21586847748391708, + "grad_norm": 0.7189548015594482, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0479, + "step": 7550 + }, + { + "epoch": 0.2161543959971408, + "grad_norm": 0.8761522769927979, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0512, + "step": 7560 + }, + { + "epoch": 0.21644031451036455, + "grad_norm": 0.6686418652534485, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.06, + "step": 7570 + }, + { + "epoch": 0.2167262330235883, + "grad_norm": 0.6385156512260437, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0495, + "step": 7580 + }, + { + "epoch": 0.217012151536812, + "grad_norm": 0.4785522520542145, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0477, + "step": 7590 + }, + { + "epoch": 0.21729807005003574, + "grad_norm": 0.883179783821106, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0472, + "step": 7600 + }, + { + "epoch": 0.21758398856325947, + "grad_norm": 0.5431568026542664, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0383, + "step": 7610 + }, + { + "epoch": 0.2178699070764832, + "grad_norm": 0.7085764408111572, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0465, + "step": 7620 + }, + { + "epoch": 0.21815582558970692, + "grad_norm": 0.4877212643623352, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0517, + "step": 7630 + }, + { + "epoch": 0.21844174410293066, + "grad_norm": 0.6874392032623291, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0555, + "step": 7640 + }, + { + "epoch": 0.2187276626161544, + "grad_norm": 0.9611791372299194, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0415, + "step": 7650 + }, + { + "epoch": 0.21901358112937813, + "grad_norm": 0.3618314862251282, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0601, + "step": 7660 + }, + { + "epoch": 0.21929949964260187, + "grad_norm": 0.5366251468658447, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0383, + "step": 7670 + }, + { + "epoch": 0.21958541815582558, + "grad_norm": 0.6323129534721375, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0536, + "step": 7680 + }, + { + "epoch": 0.21987133666904932, + "grad_norm": 0.4621681571006775, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0442, + "step": 7690 + }, + { + "epoch": 0.22015725518227305, + "grad_norm": 0.9297679662704468, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0517, + "step": 7700 + }, + { + "epoch": 0.2204431736954968, + "grad_norm": 0.5950489640235901, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0468, + "step": 7710 + }, + { + "epoch": 0.2207290922087205, + "grad_norm": 0.30251142382621765, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0471, + "step": 7720 + }, + { + "epoch": 0.22101501072194424, + "grad_norm": 0.6247804760932922, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0488, + "step": 7730 + }, + { + "epoch": 0.22130092923516798, + "grad_norm": 0.7118366360664368, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0567, + "step": 7740 + }, + { + "epoch": 0.2215868477483917, + "grad_norm": 0.6265056133270264, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.06, + "step": 7750 + }, + { + "epoch": 0.22187276626161545, + "grad_norm": 0.7232056260108948, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0393, + "step": 7760 + }, + { + "epoch": 0.22215868477483916, + "grad_norm": 0.7981307506561279, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0518, + "step": 7770 + }, + { + "epoch": 0.2224446032880629, + "grad_norm": 0.4492819011211395, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0425, + "step": 7780 + }, + { + "epoch": 0.22273052180128664, + "grad_norm": 0.578440248966217, + "learning_rate": 1.767371389304538e-05, + "loss": 0.043, + "step": 7790 + }, + { + "epoch": 0.22301644031451037, + "grad_norm": 0.8093826174736023, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0571, + "step": 7800 + }, + { + "epoch": 0.22330235882773408, + "grad_norm": 0.864661455154419, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0429, + "step": 7810 + }, + { + "epoch": 0.22358827734095782, + "grad_norm": 0.50054532289505, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0404, + "step": 7820 + }, + { + "epoch": 0.22387419585418156, + "grad_norm": 0.5690511465072632, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0406, + "step": 7830 + }, + { + "epoch": 0.2241601143674053, + "grad_norm": 0.7075231671333313, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0478, + "step": 7840 + }, + { + "epoch": 0.22444603288062903, + "grad_norm": 0.6326742768287659, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.065, + "step": 7850 + }, + { + "epoch": 0.22473195139385274, + "grad_norm": 0.48305049538612366, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0456, + "step": 7860 + }, + { + "epoch": 0.22501786990707648, + "grad_norm": 0.6333707571029663, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.048, + "step": 7870 + }, + { + "epoch": 0.22530378842030022, + "grad_norm": 0.6568662524223328, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0422, + "step": 7880 + }, + { + "epoch": 0.22558970693352395, + "grad_norm": 0.6302695870399475, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0589, + "step": 7890 + }, + { + "epoch": 0.22587562544674766, + "grad_norm": 0.6373940110206604, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0504, + "step": 7900 + }, + { + "epoch": 0.2261615439599714, + "grad_norm": 0.7108445167541504, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0486, + "step": 7910 + }, + { + "epoch": 0.22644746247319514, + "grad_norm": 0.5274208784103394, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0693, + "step": 7920 + }, + { + "epoch": 0.22673338098641888, + "grad_norm": 0.4020678997039795, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0461, + "step": 7930 + }, + { + "epoch": 0.2270192994996426, + "grad_norm": 0.5584745407104492, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0376, + "step": 7940 + }, + { + "epoch": 0.22730521801286632, + "grad_norm": 0.6614044904708862, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0461, + "step": 7950 + }, + { + "epoch": 0.22759113652609006, + "grad_norm": 0.506636917591095, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0431, + "step": 7960 + }, + { + "epoch": 0.2278770550393138, + "grad_norm": 0.5168156027793884, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0404, + "step": 7970 + }, + { + "epoch": 0.22816297355253753, + "grad_norm": 0.552480161190033, + "learning_rate": 1.754802282200567e-05, + "loss": 0.0565, + "step": 7980 + }, + { + "epoch": 0.22844889206576124, + "grad_norm": 0.8191191554069519, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0556, + "step": 7990 + }, + { + "epoch": 0.22873481057898498, + "grad_norm": 0.7767695188522339, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0447, + "step": 8000 + }, + { + "epoch": 0.22902072909220872, + "grad_norm": 0.9050281047821045, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0611, + "step": 8010 + }, + { + "epoch": 0.22930664760543246, + "grad_norm": 0.7805314660072327, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0532, + "step": 8020 + }, + { + "epoch": 0.2295925661186562, + "grad_norm": 0.6055987477302551, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0436, + "step": 8030 + }, + { + "epoch": 0.2298784846318799, + "grad_norm": 1.1075741052627563, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.053, + "step": 8040 + }, + { + "epoch": 0.23016440314510364, + "grad_norm": 0.6283855438232422, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0494, + "step": 8050 + }, + { + "epoch": 0.23045032165832738, + "grad_norm": 0.44009697437286377, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.047, + "step": 8060 + }, + { + "epoch": 0.23073624017155112, + "grad_norm": 0.4920162856578827, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0437, + "step": 8070 + }, + { + "epoch": 0.23102215868477483, + "grad_norm": 0.9286724328994751, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0513, + "step": 8080 + }, + { + "epoch": 0.23130807719799856, + "grad_norm": 0.6595107913017273, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0465, + "step": 8090 + }, + { + "epoch": 0.2315939957112223, + "grad_norm": 0.4930933713912964, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0422, + "step": 8100 + }, + { + "epoch": 0.23187991422444604, + "grad_norm": 0.6741859316825867, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0419, + "step": 8110 + }, + { + "epoch": 0.23216583273766978, + "grad_norm": 0.8081800937652588, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0449, + "step": 8120 + }, + { + "epoch": 0.23245175125089348, + "grad_norm": 1.0258036851882935, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0613, + "step": 8130 + }, + { + "epoch": 0.23273766976411722, + "grad_norm": 0.5007345080375671, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0473, + "step": 8140 + }, + { + "epoch": 0.23302358827734096, + "grad_norm": 0.3931804895401001, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0495, + "step": 8150 + }, + { + "epoch": 0.2333095067905647, + "grad_norm": 0.5907166600227356, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0449, + "step": 8160 + }, + { + "epoch": 0.2335954253037884, + "grad_norm": 0.49229851365089417, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0524, + "step": 8170 + }, + { + "epoch": 0.23388134381701214, + "grad_norm": 0.8386240601539612, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0527, + "step": 8180 + }, + { + "epoch": 0.23416726233023588, + "grad_norm": 0.7806615829467773, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0529, + "step": 8190 + }, + { + "epoch": 0.23445318084345962, + "grad_norm": 0.5716270804405212, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0534, + "step": 8200 + }, + { + "epoch": 0.23473909935668336, + "grad_norm": 1.165761947631836, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0591, + "step": 8210 + }, + { + "epoch": 0.23502501786990707, + "grad_norm": 0.867967426776886, + "learning_rate": 1.738529690353544e-05, + "loss": 0.049, + "step": 8220 + }, + { + "epoch": 0.2353109363831308, + "grad_norm": 0.5809492468833923, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0434, + "step": 8230 + }, + { + "epoch": 0.23559685489635454, + "grad_norm": 0.8418740034103394, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0461, + "step": 8240 + }, + { + "epoch": 0.23588277340957828, + "grad_norm": 0.5811617374420166, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0443, + "step": 8250 + }, + { + "epoch": 0.236168691922802, + "grad_norm": 0.7699318528175354, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0549, + "step": 8260 + }, + { + "epoch": 0.23645461043602573, + "grad_norm": 0.6066992878913879, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0415, + "step": 8270 + }, + { + "epoch": 0.23674052894924946, + "grad_norm": 0.7775973677635193, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0619, + "step": 8280 + }, + { + "epoch": 0.2370264474624732, + "grad_norm": 0.8320962190628052, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.048, + "step": 8290 + }, + { + "epoch": 0.23731236597569694, + "grad_norm": 0.7203818559646606, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0594, + "step": 8300 + }, + { + "epoch": 0.23759828448892065, + "grad_norm": 0.7634598612785339, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0614, + "step": 8310 + }, + { + "epoch": 0.23788420300214438, + "grad_norm": 0.557575523853302, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 0.23817012151536812, + "grad_norm": 1.0139968395233154, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0457, + "step": 8330 + }, + { + "epoch": 0.23845604002859186, + "grad_norm": 0.5543113946914673, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.048, + "step": 8340 + }, + { + "epoch": 0.23874195854181557, + "grad_norm": 1.0122590065002441, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0509, + "step": 8350 + }, + { + "epoch": 0.2390278770550393, + "grad_norm": 0.8776134252548218, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0475, + "step": 8360 + }, + { + "epoch": 0.23931379556826304, + "grad_norm": 0.41230106353759766, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0467, + "step": 8370 + }, + { + "epoch": 0.23959971408148678, + "grad_norm": 0.5460986495018005, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0455, + "step": 8380 + }, + { + "epoch": 0.23988563259471052, + "grad_norm": 0.5896333456039429, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.051, + "step": 8390 + }, + { + "epoch": 0.24017155110793423, + "grad_norm": 0.536375105381012, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0432, + "step": 8400 + }, + { + "epoch": 0.24045746962115797, + "grad_norm": 0.7597050666809082, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0459, + "step": 8410 + }, + { + "epoch": 0.2407433881343817, + "grad_norm": 0.6669795513153076, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0584, + "step": 8420 + }, + { + "epoch": 0.24102930664760544, + "grad_norm": 0.3614502251148224, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.039, + "step": 8430 + }, + { + "epoch": 0.24131522516082915, + "grad_norm": 0.5618023872375488, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0394, + "step": 8440 + }, + { + "epoch": 0.2416011436740529, + "grad_norm": 0.5897185802459717, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0502, + "step": 8450 + }, + { + "epoch": 0.24188706218727662, + "grad_norm": 0.5622876882553101, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0382, + "step": 8460 + }, + { + "epoch": 0.24217298070050036, + "grad_norm": 0.5639696717262268, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0652, + "step": 8470 + }, + { + "epoch": 0.2424588992137241, + "grad_norm": 0.5686836242675781, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0609, + "step": 8480 + }, + { + "epoch": 0.2427448177269478, + "grad_norm": 0.7248222827911377, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0512, + "step": 8490 + }, + { + "epoch": 0.24303073624017155, + "grad_norm": 0.6157225370407104, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0449, + "step": 8500 + }, + { + "epoch": 0.24331665475339528, + "grad_norm": 1.1660966873168945, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0427, + "step": 8510 + }, + { + "epoch": 0.24360257326661902, + "grad_norm": 1.1242589950561523, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0514, + "step": 8520 + }, + { + "epoch": 0.24388849177984273, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0491, + "step": 8530 + }, + { + "epoch": 0.24417441029306647, + "grad_norm": 0.41474589705467224, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0427, + "step": 8540 + }, + { + "epoch": 0.2444603288062902, + "grad_norm": 0.42195969820022583, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0486, + "step": 8550 + }, + { + "epoch": 0.24474624731951394, + "grad_norm": 0.3914433717727661, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0411, + "step": 8560 + }, + { + "epoch": 0.24503216583273768, + "grad_norm": 0.7590876817703247, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0439, + "step": 8570 + }, + { + "epoch": 0.2453180843459614, + "grad_norm": 0.4362296164035797, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0466, + "step": 8580 + }, + { + "epoch": 0.24560400285918513, + "grad_norm": 0.467949241399765, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0502, + "step": 8590 + }, + { + "epoch": 0.24588992137240887, + "grad_norm": 0.4731729328632355, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0599, + "step": 8600 + }, + { + "epoch": 0.2461758398856326, + "grad_norm": 0.491644948720932, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0524, + "step": 8610 + }, + { + "epoch": 0.2464617583988563, + "grad_norm": 0.5254928469657898, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0469, + "step": 8620 + }, + { + "epoch": 0.24674767691208005, + "grad_norm": 0.5721238255500793, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0493, + "step": 8630 + }, + { + "epoch": 0.2470335954253038, + "grad_norm": 0.5806096792221069, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0391, + "step": 8640 + }, + { + "epoch": 0.24731951393852752, + "grad_norm": 0.6683222055435181, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.24760543245175126, + "grad_norm": 0.41728726029396057, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0411, + "step": 8660 + }, + { + "epoch": 0.24789135096497497, + "grad_norm": 0.6001113653182983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0413, + "step": 8670 + }, + { + "epoch": 0.2481772694781987, + "grad_norm": 0.43813610076904297, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0389, + "step": 8680 + }, + { + "epoch": 0.24846318799142245, + "grad_norm": 1.5533791780471802, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0597, + "step": 8690 + }, + { + "epoch": 0.24874910650464618, + "grad_norm": 1.175837755203247, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 0.2490350250178699, + "grad_norm": 0.4798300862312317, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0459, + "step": 8710 + }, + { + "epoch": 0.24932094353109363, + "grad_norm": 0.7334772944450378, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0432, + "step": 8720 + }, + { + "epoch": 0.24960686204431737, + "grad_norm": 0.9633310437202454, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.05, + "step": 8730 + }, + { + "epoch": 0.2498927805575411, + "grad_norm": 0.7353480458259583, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.25017869907076484, + "grad_norm": 0.5958748459815979, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0428, + "step": 8750 + }, + { + "epoch": 0.2504646175839886, + "grad_norm": 0.8538689613342285, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0498, + "step": 8760 + }, + { + "epoch": 0.2507505360972123, + "grad_norm": 0.606607973575592, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.251036454610436, + "grad_norm": 0.3999035060405731, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0714, + "step": 8780 + }, + { + "epoch": 0.25132237312365974, + "grad_norm": 0.807314932346344, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.057, + "step": 8790 + }, + { + "epoch": 0.2516082916368835, + "grad_norm": 0.5238217115402222, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0485, + "step": 8800 + }, + { + "epoch": 0.2518942101501072, + "grad_norm": 1.6465950012207031, + "learning_rate": 1.696714953556411e-05, + "loss": 0.056, + "step": 8810 + }, + { + "epoch": 0.25218012866333095, + "grad_norm": 0.6568214297294617, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0424, + "step": 8820 + }, + { + "epoch": 0.2524660471765547, + "grad_norm": 0.4695168137550354, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0456, + "step": 8830 + }, + { + "epoch": 0.2527519656897784, + "grad_norm": 0.5652263164520264, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0527, + "step": 8840 + }, + { + "epoch": 0.25303788420300216, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0441, + "step": 8850 + }, + { + "epoch": 0.2533238027162259, + "grad_norm": 0.8288971781730652, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0513, + "step": 8860 + }, + { + "epoch": 0.2536097212294496, + "grad_norm": 0.8606051802635193, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0416, + "step": 8870 + }, + { + "epoch": 0.2538956397426733, + "grad_norm": 0.7235842347145081, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0481, + "step": 8880 + }, + { + "epoch": 0.25418155825589706, + "grad_norm": 0.9602673053741455, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0465, + "step": 8890 + }, + { + "epoch": 0.2544674767691208, + "grad_norm": 0.6431217789649963, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0489, + "step": 8900 + }, + { + "epoch": 0.25475339528234453, + "grad_norm": 0.42215701937675476, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0376, + "step": 8910 + }, + { + "epoch": 0.25503931379556827, + "grad_norm": 0.5899976491928101, + "learning_rate": 1.688644181174108e-05, + "loss": 0.048, + "step": 8920 + }, + { + "epoch": 0.255325232308792, + "grad_norm": 0.9504411816596985, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.047, + "step": 8930 + }, + { + "epoch": 0.25561115082201574, + "grad_norm": 0.5808438062667847, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0535, + "step": 8940 + }, + { + "epoch": 0.2558970693352395, + "grad_norm": 0.3811270594596863, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0418, + "step": 8950 + }, + { + "epoch": 0.25618298784846316, + "grad_norm": 1.0257363319396973, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0548, + "step": 8960 + }, + { + "epoch": 0.2564689063616869, + "grad_norm": 0.7294469475746155, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0569, + "step": 8970 + }, + { + "epoch": 0.25675482487491064, + "grad_norm": 0.4967000484466553, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0488, + "step": 8980 + }, + { + "epoch": 0.2570407433881344, + "grad_norm": 0.9160422086715698, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0471, + "step": 8990 + }, + { + "epoch": 0.2573266619013581, + "grad_norm": 0.5125435590744019, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.25761258041458185, + "grad_norm": 0.5617201328277588, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0597, + "step": 9010 + }, + { + "epoch": 0.2578984989278056, + "grad_norm": 0.7771851420402527, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0485, + "step": 9020 + }, + { + "epoch": 0.2581844174410293, + "grad_norm": 0.8434289693832397, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0429, + "step": 9030 + }, + { + "epoch": 0.25847033595425306, + "grad_norm": 0.513541042804718, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0488, + "step": 9040 + }, + { + "epoch": 0.25875625446747674, + "grad_norm": 1.0142096281051636, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0517, + "step": 9050 + }, + { + "epoch": 0.2590421729807005, + "grad_norm": 0.6343669295310974, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.049, + "step": 9060 + }, + { + "epoch": 0.2593280914939242, + "grad_norm": 0.33996936678886414, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.037, + "step": 9070 + }, + { + "epoch": 0.25961401000714796, + "grad_norm": 0.5964446663856506, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 0.2598999285203717, + "grad_norm": 0.4989728629589081, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0463, + "step": 9090 + }, + { + "epoch": 0.26018584703359543, + "grad_norm": 0.7735986113548279, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0576, + "step": 9100 + }, + { + "epoch": 0.26047176554681917, + "grad_norm": 1.2520418167114258, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0577, + "step": 9110 + }, + { + "epoch": 0.2607576840600429, + "grad_norm": 0.45247936248779297, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0458, + "step": 9120 + }, + { + "epoch": 0.26104360257326664, + "grad_norm": 0.8944823145866394, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0498, + "step": 9130 + }, + { + "epoch": 0.2613295210864903, + "grad_norm": 0.8308315277099609, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0545, + "step": 9140 + }, + { + "epoch": 0.26161543959971406, + "grad_norm": 0.6838778853416443, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 0.2619013581129378, + "grad_norm": 1.5998408794403076, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0591, + "step": 9160 + }, + { + "epoch": 0.26218727662616154, + "grad_norm": 0.8548596501350403, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.04, + "step": 9170 + }, + { + "epoch": 0.2624731951393853, + "grad_norm": 0.5784913897514343, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0464, + "step": 9180 + }, + { + "epoch": 0.262759113652609, + "grad_norm": 1.490502953529358, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0672, + "step": 9190 + }, + { + "epoch": 0.26304503216583275, + "grad_norm": 0.8950793743133545, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0532, + "step": 9200 + }, + { + "epoch": 0.2633309506790565, + "grad_norm": 0.5513611435890198, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 0.2636168691922802, + "grad_norm": 1.0512864589691162, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0487, + "step": 9220 + }, + { + "epoch": 0.2639027877055039, + "grad_norm": 0.48180028796195984, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0543, + "step": 9230 + }, + { + "epoch": 0.26418870621872764, + "grad_norm": 0.5451590418815613, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0553, + "step": 9240 + }, + { + "epoch": 0.2644746247319514, + "grad_norm": 0.6986148953437805, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0523, + "step": 9250 + }, + { + "epoch": 0.2647605432451751, + "grad_norm": 0.5977929830551147, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0446, + "step": 9260 + }, + { + "epoch": 0.26504646175839885, + "grad_norm": 0.6042361855506897, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0716, + "step": 9270 + }, + { + "epoch": 0.2653323802716226, + "grad_norm": 0.473418265581131, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0378, + "step": 9280 + }, + { + "epoch": 0.26561829878484633, + "grad_norm": 0.9332809448242188, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0484, + "step": 9290 + }, + { + "epoch": 0.26590421729807007, + "grad_norm": 0.5209246277809143, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0439, + "step": 9300 + }, + { + "epoch": 0.2661901358112938, + "grad_norm": 0.5742560625076294, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0468, + "step": 9310 + }, + { + "epoch": 0.2664760543245175, + "grad_norm": 0.585503876209259, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0507, + "step": 9320 + }, + { + "epoch": 0.2667619728377412, + "grad_norm": 0.5254957675933838, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0436, + "step": 9330 + }, + { + "epoch": 0.26704789135096496, + "grad_norm": 0.48314452171325684, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0417, + "step": 9340 + }, + { + "epoch": 0.2673338098641887, + "grad_norm": 0.630020022392273, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0425, + "step": 9350 + }, + { + "epoch": 0.26761972837741244, + "grad_norm": 0.3545299470424652, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0338, + "step": 9360 + }, + { + "epoch": 0.2679056468906362, + "grad_norm": 0.6934211850166321, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0445, + "step": 9370 + }, + { + "epoch": 0.2681915654038599, + "grad_norm": 0.6544952392578125, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0451, + "step": 9380 + }, + { + "epoch": 0.26847748391708365, + "grad_norm": 0.4581946134567261, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0422, + "step": 9390 + }, + { + "epoch": 0.2687634024303074, + "grad_norm": 0.6338506937026978, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0576, + "step": 9400 + }, + { + "epoch": 0.26904932094353107, + "grad_norm": 0.8165014386177063, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0474, + "step": 9410 + }, + { + "epoch": 0.2693352394567548, + "grad_norm": 0.793222188949585, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0546, + "step": 9420 + }, + { + "epoch": 0.26962115796997854, + "grad_norm": 0.3669852316379547, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0461, + "step": 9430 + }, + { + "epoch": 0.2699070764832023, + "grad_norm": 0.7339810729026794, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0433, + "step": 9440 + }, + { + "epoch": 0.270192994996426, + "grad_norm": 0.4948982298374176, + "learning_rate": 1.648606940465527e-05, + "loss": 0.048, + "step": 9450 + }, + { + "epoch": 0.27047891350964975, + "grad_norm": 0.4681016206741333, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0437, + "step": 9460 + }, + { + "epoch": 0.2707648320228735, + "grad_norm": 0.5091472864151001, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0576, + "step": 9470 + }, + { + "epoch": 0.27105075053609723, + "grad_norm": 0.5683515071868896, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0503, + "step": 9480 + }, + { + "epoch": 0.27133666904932097, + "grad_norm": 0.626844048500061, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0495, + "step": 9490 + }, + { + "epoch": 0.27162258756254465, + "grad_norm": 0.6757943034172058, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0495, + "step": 9500 + }, + { + "epoch": 0.2719085060757684, + "grad_norm": 0.7049196362495422, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0579, + "step": 9510 + }, + { + "epoch": 0.2721944245889921, + "grad_norm": 0.6469181776046753, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.051, + "step": 9520 + }, + { + "epoch": 0.27248034310221586, + "grad_norm": 0.5414942502975464, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0433, + "step": 9530 + }, + { + "epoch": 0.2727662616154396, + "grad_norm": 0.5642798542976379, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0495, + "step": 9540 + }, + { + "epoch": 0.27305218012866334, + "grad_norm": 1.0527595281600952, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0445, + "step": 9550 + }, + { + "epoch": 0.2733380986418871, + "grad_norm": 0.8501784801483154, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0627, + "step": 9560 + }, + { + "epoch": 0.2736240171551108, + "grad_norm": 0.7892033457756042, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 0.27390993566833455, + "grad_norm": 0.3588624596595764, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0512, + "step": 9580 + }, + { + "epoch": 0.27419585418155823, + "grad_norm": 0.7474772930145264, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.6217718124389648, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0435, + "step": 9600 + }, + { + "epoch": 0.2747676912080057, + "grad_norm": 0.7711623907089233, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.054, + "step": 9610 + }, + { + "epoch": 0.27505360972122944, + "grad_norm": 0.8171371221542358, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0371, + "step": 9620 + }, + { + "epoch": 0.2753395282344532, + "grad_norm": 0.8668338060379028, + "learning_rate": 1.634591312387623e-05, + "loss": 0.055, + "step": 9630 + }, + { + "epoch": 0.2756254467476769, + "grad_norm": 0.5683940052986145, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0478, + "step": 9640 + }, + { + "epoch": 0.27591136526090065, + "grad_norm": 0.44098007678985596, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0531, + "step": 9650 + }, + { + "epoch": 0.2761972837741244, + "grad_norm": 0.8305087685585022, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0462, + "step": 9660 + }, + { + "epoch": 0.27648320228734813, + "grad_norm": 0.9088799953460693, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0489, + "step": 9670 + }, + { + "epoch": 0.2767691208005718, + "grad_norm": 0.5590132474899292, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0486, + "step": 9680 + }, + { + "epoch": 0.27705503931379555, + "grad_norm": 0.776713490486145, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0443, + "step": 9690 + }, + { + "epoch": 0.2773409578270193, + "grad_norm": 0.6107578873634338, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0461, + "step": 9700 + }, + { + "epoch": 0.277626876340243, + "grad_norm": 0.4635901153087616, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0397, + "step": 9710 + }, + { + "epoch": 0.27791279485346676, + "grad_norm": 0.4220955967903137, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0463, + "step": 9720 + }, + { + "epoch": 0.2781987133666905, + "grad_norm": 0.4947739243507385, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0397, + "step": 9730 + }, + { + "epoch": 0.27848463187991424, + "grad_norm": 0.5589033961296082, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0426, + "step": 9740 + }, + { + "epoch": 0.278770550393138, + "grad_norm": 0.4904254972934723, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0458, + "step": 9750 + }, + { + "epoch": 0.2790564689063617, + "grad_norm": 0.34956127405166626, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.2793423874195854, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0424, + "step": 9770 + }, + { + "epoch": 0.27962830593280913, + "grad_norm": 0.48727869987487793, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0451, + "step": 9780 + }, + { + "epoch": 0.27991422444603287, + "grad_norm": 0.7314761281013489, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0523, + "step": 9790 + }, + { + "epoch": 0.2802001429592566, + "grad_norm": 0.5017405152320862, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0423, + "step": 9800 + }, + { + "epoch": 0.28048606147248034, + "grad_norm": 0.8375383615493774, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0435, + "step": 9810 + }, + { + "epoch": 0.2807719799857041, + "grad_norm": 0.8702818155288696, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0487, + "step": 9820 + }, + { + "epoch": 0.2810578984989278, + "grad_norm": 0.4649866223335266, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0483, + "step": 9830 + }, + { + "epoch": 0.28134381701215155, + "grad_norm": 0.7464607357978821, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0463, + "step": 9840 + }, + { + "epoch": 0.2816297355253753, + "grad_norm": 0.48055607080459595, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0418, + "step": 9850 + }, + { + "epoch": 0.281915654038599, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0433, + "step": 9860 + }, + { + "epoch": 0.2822015725518227, + "grad_norm": 0.8859265446662903, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0605, + "step": 9870 + }, + { + "epoch": 0.28248749106504645, + "grad_norm": 0.8236640691757202, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0441, + "step": 9880 + }, + { + "epoch": 0.2827734095782702, + "grad_norm": 0.6617199778556824, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0515, + "step": 9890 + }, + { + "epoch": 0.2830593280914939, + "grad_norm": 0.8017821907997131, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0503, + "step": 9900 + }, + { + "epoch": 0.28334524660471766, + "grad_norm": 1.070827603340149, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0485, + "step": 9910 + }, + { + "epoch": 0.2836311651179414, + "grad_norm": 1.021888256072998, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0479, + "step": 9920 + }, + { + "epoch": 0.28391708363116513, + "grad_norm": 0.34402501583099365, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0391, + "step": 9930 + }, + { + "epoch": 0.28420300214438887, + "grad_norm": 0.58541339635849, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0461, + "step": 9940 + }, + { + "epoch": 0.28448892065761255, + "grad_norm": 0.8062207102775574, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0553, + "step": 9950 + }, + { + "epoch": 0.2847748391708363, + "grad_norm": 0.6435661315917969, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0536, + "step": 9960 + }, + { + "epoch": 0.28506075768406003, + "grad_norm": 0.5670832395553589, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0405, + "step": 9970 + }, + { + "epoch": 0.28534667619728377, + "grad_norm": 0.45282548666000366, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0458, + "step": 9980 + }, + { + "epoch": 0.2856325947105075, + "grad_norm": 0.42272916436195374, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0392, + "step": 9990 + }, + { + "epoch": 0.28591851322373124, + "grad_norm": 0.5791928768157959, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0453, + "step": 10000 + }, + { + "epoch": 0.286204431736955, + "grad_norm": 0.9841408729553223, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.052, + "step": 10010 + }, + { + "epoch": 0.2864903502501787, + "grad_norm": 0.8658338785171509, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0461, + "step": 10020 + }, + { + "epoch": 0.28677626876340245, + "grad_norm": 0.624788224697113, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0416, + "step": 10030 + }, + { + "epoch": 0.28706218727662614, + "grad_norm": 0.6108028888702393, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0548, + "step": 10040 + }, + { + "epoch": 0.2873481057898499, + "grad_norm": 0.7907708883285522, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0406, + "step": 10050 + }, + { + "epoch": 0.2876340243030736, + "grad_norm": 0.7695413827896118, + "learning_rate": 1.60029690609047e-05, + "loss": 0.061, + "step": 10060 + }, + { + "epoch": 0.28791994281629735, + "grad_norm": 0.4407683312892914, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0483, + "step": 10070 + }, + { + "epoch": 0.2882058613295211, + "grad_norm": 0.6242743730545044, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.039, + "step": 10080 + }, + { + "epoch": 0.2884917798427448, + "grad_norm": 0.8752113580703735, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0433, + "step": 10090 + }, + { + "epoch": 0.28877769835596856, + "grad_norm": 0.8834511041641235, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0348, + "step": 10100 + }, + { + "epoch": 0.2890636168691923, + "grad_norm": 1.0036063194274902, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0593, + "step": 10110 + }, + { + "epoch": 0.28934953538241603, + "grad_norm": 0.5511205196380615, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0459, + "step": 10120 + }, + { + "epoch": 0.2896354538956397, + "grad_norm": 0.7717337012290955, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0381, + "step": 10130 + }, + { + "epoch": 0.28992137240886345, + "grad_norm": 1.123363971710205, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0462, + "step": 10140 + }, + { + "epoch": 0.2902072909220872, + "grad_norm": 0.6212007403373718, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0446, + "step": 10150 + }, + { + "epoch": 0.29049320943531093, + "grad_norm": 0.5547964572906494, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0362, + "step": 10160 + }, + { + "epoch": 0.29077912794853467, + "grad_norm": 0.593225359916687, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0462, + "step": 10170 + }, + { + "epoch": 0.2910650464617584, + "grad_norm": 0.5569560527801514, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0508, + "step": 10180 + }, + { + "epoch": 0.29135096497498214, + "grad_norm": 0.5464656949043274, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0399, + "step": 10190 + }, + { + "epoch": 0.2916368834882059, + "grad_norm": 1.2456778287887573, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0494, + "step": 10200 + }, + { + "epoch": 0.2919228020014296, + "grad_norm": 0.7862445712089539, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0551, + "step": 10210 + }, + { + "epoch": 0.2922087205146533, + "grad_norm": 0.745941698551178, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0469, + "step": 10220 + }, + { + "epoch": 0.29249463902787703, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0492, + "step": 10230 + }, + { + "epoch": 0.29278055754110077, + "grad_norm": 0.659205973148346, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0453, + "step": 10240 + }, + { + "epoch": 0.2930664760543245, + "grad_norm": 0.6925905346870422, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0463, + "step": 10250 + }, + { + "epoch": 0.29335239456754825, + "grad_norm": 0.479115754365921, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0395, + "step": 10260 + }, + { + "epoch": 0.293638313080772, + "grad_norm": 0.5085121393203735, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0504, + "step": 10270 + }, + { + "epoch": 0.2939242315939957, + "grad_norm": 0.46833914518356323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0411, + "step": 10280 + }, + { + "epoch": 0.29421015010721946, + "grad_norm": 0.4534672796726227, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0491, + "step": 10290 + }, + { + "epoch": 0.2944960686204432, + "grad_norm": 0.5704737305641174, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0391, + "step": 10300 + }, + { + "epoch": 0.2947819871336669, + "grad_norm": 1.0342676639556885, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0681, + "step": 10310 + }, + { + "epoch": 0.2950679056468906, + "grad_norm": 0.5002169013023376, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0429, + "step": 10320 + }, + { + "epoch": 0.29535382416011435, + "grad_norm": 0.5565863847732544, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0575, + "step": 10330 + }, + { + "epoch": 0.2956397426733381, + "grad_norm": 0.7826551198959351, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0448, + "step": 10340 + }, + { + "epoch": 0.29592566118656183, + "grad_norm": 0.7019012570381165, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0436, + "step": 10350 + }, + { + "epoch": 0.29621157969978557, + "grad_norm": 0.8324534893035889, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0503, + "step": 10360 + }, + { + "epoch": 0.2964974982130093, + "grad_norm": 0.7064073085784912, + "learning_rate": 1.574895332125391e-05, + "loss": 0.041, + "step": 10370 + }, + { + "epoch": 0.29678341672623304, + "grad_norm": 0.5634047389030457, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0474, + "step": 10380 + }, + { + "epoch": 0.2970693352394568, + "grad_norm": 0.8504926562309265, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0502, + "step": 10390 + }, + { + "epoch": 0.29735525375268046, + "grad_norm": 0.508313775062561, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0368, + "step": 10400 + }, + { + "epoch": 0.2976411722659042, + "grad_norm": 0.5851112008094788, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0472, + "step": 10410 + }, + { + "epoch": 0.29792709077912793, + "grad_norm": 0.5689557790756226, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0479, + "step": 10420 + }, + { + "epoch": 0.29821300929235167, + "grad_norm": 0.5026743412017822, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0406, + "step": 10430 + }, + { + "epoch": 0.2984989278055754, + "grad_norm": 0.5662751197814941, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0441, + "step": 10440 + }, + { + "epoch": 0.29878484631879915, + "grad_norm": 0.899709939956665, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.2990707648320229, + "grad_norm": 0.4681940972805023, + "learning_rate": 1.567419089313346e-05, + "loss": 0.054, + "step": 10460 + }, + { + "epoch": 0.2993566833452466, + "grad_norm": 0.39646071195602417, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0375, + "step": 10470 + }, + { + "epoch": 0.29964260185847036, + "grad_norm": 1.204815149307251, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0487, + "step": 10480 + }, + { + "epoch": 0.29992852037169404, + "grad_norm": 0.4507630467414856, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0516, + "step": 10490 + }, + { + "epoch": 0.3002144388849178, + "grad_norm": 0.9783321022987366, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0642, + "step": 10500 + }, + { + "epoch": 0.3005003573981415, + "grad_norm": 0.5406969785690308, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0447, + "step": 10510 + }, + { + "epoch": 0.30078627591136525, + "grad_norm": 0.44153860211372375, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0449, + "step": 10520 + }, + { + "epoch": 0.301072194424589, + "grad_norm": 0.5723687410354614, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0548, + "step": 10530 + }, + { + "epoch": 0.3013581129378127, + "grad_norm": 0.4453120529651642, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0434, + "step": 10540 + }, + { + "epoch": 0.30164403145103647, + "grad_norm": 0.34224697947502136, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0385, + "step": 10550 + }, + { + "epoch": 0.3019299499642602, + "grad_norm": 0.6389157176017761, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0569, + "step": 10560 + }, + { + "epoch": 0.30221586847748394, + "grad_norm": 0.5845953822135925, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0467, + "step": 10570 + }, + { + "epoch": 0.3025017869907076, + "grad_norm": 0.6581900119781494, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0422, + "step": 10580 + }, + { + "epoch": 0.30278770550393136, + "grad_norm": 0.4964161813259125, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0428, + "step": 10590 + }, + { + "epoch": 0.3030736240171551, + "grad_norm": 0.635380208492279, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0442, + "step": 10600 + }, + { + "epoch": 0.30335954253037883, + "grad_norm": 0.9795969128608704, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0517, + "step": 10610 + }, + { + "epoch": 0.30364546104360257, + "grad_norm": 0.9987231492996216, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0514, + "step": 10620 + }, + { + "epoch": 0.3039313795568263, + "grad_norm": 0.6384946703910828, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0471, + "step": 10630 + }, + { + "epoch": 0.30421729807005005, + "grad_norm": 0.49352115392684937, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0351, + "step": 10640 + }, + { + "epoch": 0.3045032165832738, + "grad_norm": 0.45028480887413025, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0438, + "step": 10650 + }, + { + "epoch": 0.3047891350964975, + "grad_norm": 0.5717794895172119, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0491, + "step": 10660 + }, + { + "epoch": 0.3050750536097212, + "grad_norm": 0.5436326265335083, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0407, + "step": 10670 + }, + { + "epoch": 0.30536097212294494, + "grad_norm": 0.7777692675590515, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0436, + "step": 10680 + }, + { + "epoch": 0.3056468906361687, + "grad_norm": 0.6597929000854492, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0407, + "step": 10690 + }, + { + "epoch": 0.3059328091493924, + "grad_norm": 0.6059311032295227, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0481, + "step": 10700 + }, + { + "epoch": 0.30621872766261615, + "grad_norm": 0.5530681014060974, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0418, + "step": 10710 + }, + { + "epoch": 0.3065046461758399, + "grad_norm": 0.5778716802597046, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0429, + "step": 10720 + }, + { + "epoch": 0.3067905646890636, + "grad_norm": 0.4573792517185211, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0586, + "step": 10730 + }, + { + "epoch": 0.30707648320228736, + "grad_norm": 0.8193615078926086, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0474, + "step": 10740 + }, + { + "epoch": 0.3073624017155111, + "grad_norm": 0.9410123229026794, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.3076483202287348, + "grad_norm": 0.8244432806968689, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0462, + "step": 10760 + }, + { + "epoch": 0.3079342387419585, + "grad_norm": 0.644899845123291, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0479, + "step": 10770 + }, + { + "epoch": 0.30822015725518226, + "grad_norm": 0.28044867515563965, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.04, + "step": 10780 + }, + { + "epoch": 0.308506075768406, + "grad_norm": 0.6538394093513489, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0406, + "step": 10790 + }, + { + "epoch": 0.30879199428162973, + "grad_norm": 0.9572822451591492, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0505, + "step": 10800 + }, + { + "epoch": 0.30907791279485347, + "grad_norm": 0.539826512336731, + "learning_rate": 1.537928999540189e-05, + "loss": 0.05, + "step": 10810 + }, + { + "epoch": 0.3093638313080772, + "grad_norm": 0.801988959312439, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0454, + "step": 10820 + }, + { + "epoch": 0.30964974982130095, + "grad_norm": 0.57478928565979, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.039, + "step": 10830 + }, + { + "epoch": 0.3099356683345247, + "grad_norm": 0.6313017010688782, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0384, + "step": 10840 + }, + { + "epoch": 0.31022158684774837, + "grad_norm": 0.507997989654541, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0365, + "step": 10850 + }, + { + "epoch": 0.3105075053609721, + "grad_norm": 0.5152313709259033, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0487, + "step": 10860 + }, + { + "epoch": 0.31079342387419584, + "grad_norm": 0.6123478412628174, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0405, + "step": 10870 + }, + { + "epoch": 0.3110793423874196, + "grad_norm": 1.079551100730896, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0443, + "step": 10880 + }, + { + "epoch": 0.3113652609006433, + "grad_norm": 0.39866960048675537, + "learning_rate": 1.531098472380285e-05, + "loss": 0.04, + "step": 10890 + }, + { + "epoch": 0.31165117941386705, + "grad_norm": 0.3715427815914154, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0387, + "step": 10900 + }, + { + "epoch": 0.3119370979270908, + "grad_norm": 0.7201068997383118, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.054, + "step": 10910 + }, + { + "epoch": 0.3122230164403145, + "grad_norm": 0.9512631893157959, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0383, + "step": 10920 + }, + { + "epoch": 0.31250893495353826, + "grad_norm": 0.5948206186294556, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0472, + "step": 10930 + }, + { + "epoch": 0.31279485346676195, + "grad_norm": 0.7174249291419983, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0437, + "step": 10940 + }, + { + "epoch": 0.3130807719799857, + "grad_norm": 0.6190982460975647, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0383, + "step": 10950 + }, + { + "epoch": 0.3133666904932094, + "grad_norm": 0.7733815312385559, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0327, + "step": 10960 + }, + { + "epoch": 0.31365260900643316, + "grad_norm": 1.2995271682739258, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0427, + "step": 10970 + }, + { + "epoch": 0.3139385275196569, + "grad_norm": 1.1102336645126343, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.04, + "step": 10980 + }, + { + "epoch": 0.31422444603288063, + "grad_norm": 0.7618277668952942, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0488, + "step": 10990 + }, + { + "epoch": 0.31451036454610437, + "grad_norm": 0.5355142951011658, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0436, + "step": 11000 + }, + { + "epoch": 0.3147962830593281, + "grad_norm": 1.3410072326660156, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0463, + "step": 11010 + }, + { + "epoch": 0.31508220157255185, + "grad_norm": 0.7810450196266174, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0493, + "step": 11020 + }, + { + "epoch": 0.3153681200857755, + "grad_norm": 0.6452206373214722, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0354, + "step": 11030 + }, + { + "epoch": 0.31565403859899926, + "grad_norm": 1.037593126296997, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0418, + "step": 11040 + }, + { + "epoch": 0.315939957112223, + "grad_norm": 0.7032834887504578, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0431, + "step": 11050 + }, + { + "epoch": 0.31622587562544674, + "grad_norm": 0.5168939232826233, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0472, + "step": 11060 + }, + { + "epoch": 0.3165117941386705, + "grad_norm": 0.5239925384521484, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0438, + "step": 11070 + }, + { + "epoch": 0.3167977126518942, + "grad_norm": 0.8209654688835144, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0506, + "step": 11080 + }, + { + "epoch": 0.31708363116511795, + "grad_norm": 0.5318232178688049, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0516, + "step": 11090 + }, + { + "epoch": 0.3173695496783417, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0482, + "step": 11100 + }, + { + "epoch": 0.3176554681915654, + "grad_norm": 0.6691215634346008, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.046, + "step": 11110 + }, + { + "epoch": 0.3179413867047891, + "grad_norm": 0.4862753450870514, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 0.31822730521801285, + "grad_norm": 0.4640316963195801, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0433, + "step": 11130 + }, + { + "epoch": 0.3185132237312366, + "grad_norm": 0.7841521501541138, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0445, + "step": 11140 + }, + { + "epoch": 0.3187991422444603, + "grad_norm": 0.6809426546096802, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0518, + "step": 11150 + }, + { + "epoch": 0.31908506075768406, + "grad_norm": 0.6195946931838989, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0569, + "step": 11160 + }, + { + "epoch": 0.3193709792709078, + "grad_norm": 0.7289860248565674, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0487, + "step": 11170 + }, + { + "epoch": 0.31965689778413153, + "grad_norm": 0.5575736165046692, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0409, + "step": 11180 + }, + { + "epoch": 0.31994281629735527, + "grad_norm": 0.8619267344474792, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0424, + "step": 11190 + }, + { + "epoch": 0.320228734810579, + "grad_norm": 0.740242063999176, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0474, + "step": 11200 + }, + { + "epoch": 0.3205146533238027, + "grad_norm": 0.4169894754886627, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0395, + "step": 11210 + }, + { + "epoch": 0.3208005718370264, + "grad_norm": 0.5773794651031494, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0414, + "step": 11220 + }, + { + "epoch": 0.32108649035025016, + "grad_norm": 0.4941500723361969, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0484, + "step": 11230 + }, + { + "epoch": 0.3213724088634739, + "grad_norm": 0.7985579371452332, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.051, + "step": 11240 + }, + { + "epoch": 0.32165832737669764, + "grad_norm": 0.5262066721916199, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0434, + "step": 11250 + }, + { + "epoch": 0.3219442458899214, + "grad_norm": 0.4074312150478363, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0428, + "step": 11260 + }, + { + "epoch": 0.3222301644031451, + "grad_norm": 1.0757715702056885, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0468, + "step": 11270 + }, + { + "epoch": 0.32251608291636885, + "grad_norm": 0.7281575202941895, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0386, + "step": 11280 + }, + { + "epoch": 0.3228020014295926, + "grad_norm": 0.35078516602516174, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0413, + "step": 11290 + }, + { + "epoch": 0.32308791994281627, + "grad_norm": 0.5642452836036682, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0435, + "step": 11300 + }, + { + "epoch": 0.32337383845604, + "grad_norm": 0.5326974987983704, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0459, + "step": 11310 + }, + { + "epoch": 0.32365975696926375, + "grad_norm": 0.6212049126625061, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0451, + "step": 11320 + }, + { + "epoch": 0.3239456754824875, + "grad_norm": 0.4887222349643707, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0445, + "step": 11330 + }, + { + "epoch": 0.3242315939957112, + "grad_norm": 0.6692403554916382, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0423, + "step": 11340 + }, + { + "epoch": 0.32451751250893496, + "grad_norm": 0.7166061997413635, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0445, + "step": 11350 + }, + { + "epoch": 0.3248034310221587, + "grad_norm": 0.5342463850975037, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0394, + "step": 11360 + }, + { + "epoch": 0.32508934953538243, + "grad_norm": 1.0617904663085938, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0401, + "step": 11370 + }, + { + "epoch": 0.32537526804860617, + "grad_norm": 0.9869458675384521, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0508, + "step": 11380 + }, + { + "epoch": 0.32566118656182985, + "grad_norm": 0.32021698355674744, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0346, + "step": 11390 + }, + { + "epoch": 0.3259471050750536, + "grad_norm": 0.6566154360771179, + "learning_rate": 1.486814531655139e-05, + "loss": 0.046, + "step": 11400 + }, + { + "epoch": 0.3262330235882773, + "grad_norm": 0.6716777086257935, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.045, + "step": 11410 + }, + { + "epoch": 0.32651894210150106, + "grad_norm": 0.7489042282104492, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0443, + "step": 11420 + }, + { + "epoch": 0.3268048606147248, + "grad_norm": 0.6040313243865967, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0418, + "step": 11430 + }, + { + "epoch": 0.32709077912794854, + "grad_norm": 0.4891999363899231, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0342, + "step": 11440 + }, + { + "epoch": 0.3273766976411723, + "grad_norm": 0.4264339506626129, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0414, + "step": 11450 + }, + { + "epoch": 0.327662616154396, + "grad_norm": 0.5535606741905212, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0362, + "step": 11460 + }, + { + "epoch": 0.32794853466761975, + "grad_norm": 0.566705048084259, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0472, + "step": 11470 + }, + { + "epoch": 0.32823445318084343, + "grad_norm": 0.8539089560508728, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0478, + "step": 11480 + }, + { + "epoch": 0.32852037169406717, + "grad_norm": 0.3981179893016815, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0429, + "step": 11490 + }, + { + "epoch": 0.3288062902072909, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0487, + "step": 11500 + }, + { + "epoch": 0.32909220872051465, + "grad_norm": 0.45551198720932007, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0384, + "step": 11510 + }, + { + "epoch": 0.3293781272337384, + "grad_norm": 0.6321517825126648, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0541, + "step": 11520 + }, + { + "epoch": 0.3296640457469621, + "grad_norm": 0.7971932888031006, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0445, + "step": 11530 + }, + { + "epoch": 0.32994996426018586, + "grad_norm": 0.5022657513618469, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0414, + "step": 11540 + }, + { + "epoch": 0.3302358827734096, + "grad_norm": 0.7302954196929932, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.044, + "step": 11550 + }, + { + "epoch": 0.33052180128663333, + "grad_norm": 0.5123834013938904, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0451, + "step": 11560 + }, + { + "epoch": 0.330807719799857, + "grad_norm": 0.5261625647544861, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0416, + "step": 11570 + }, + { + "epoch": 0.33109363831308075, + "grad_norm": 0.5782840251922607, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0419, + "step": 11580 + }, + { + "epoch": 0.3313795568263045, + "grad_norm": 0.9754800796508789, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0403, + "step": 11590 + }, + { + "epoch": 0.3316654753395282, + "grad_norm": 0.48157551884651184, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0459, + "step": 11600 + }, + { + "epoch": 0.33195139385275196, + "grad_norm": 0.4394964277744293, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0461, + "step": 11610 + }, + { + "epoch": 0.3322373123659757, + "grad_norm": 1.220790147781372, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0448, + "step": 11620 + }, + { + "epoch": 0.33252323087919944, + "grad_norm": 0.6908231973648071, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0431, + "step": 11630 + }, + { + "epoch": 0.3328091493924232, + "grad_norm": 0.45382779836654663, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0379, + "step": 11640 + }, + { + "epoch": 0.3330950679056469, + "grad_norm": 0.5963619947433472, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0465, + "step": 11650 + }, + { + "epoch": 0.3333809864188706, + "grad_norm": 0.676210880279541, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0411, + "step": 11660 + }, + { + "epoch": 0.33366690493209433, + "grad_norm": 0.893473744392395, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0443, + "step": 11670 + }, + { + "epoch": 0.33395282344531807, + "grad_norm": 0.30655553936958313, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.04, + "step": 11680 + }, + { + "epoch": 0.3342387419585418, + "grad_norm": 0.899615466594696, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0462, + "step": 11690 + }, + { + "epoch": 0.33452466047176554, + "grad_norm": 0.5037568807601929, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0394, + "step": 11700 + }, + { + "epoch": 0.3348105789849893, + "grad_norm": 0.573716402053833, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0426, + "step": 11710 + }, + { + "epoch": 0.335096497498213, + "grad_norm": 0.4985221326351166, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0422, + "step": 11720 + }, + { + "epoch": 0.33538241601143676, + "grad_norm": 0.8864797353744507, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0504, + "step": 11730 + }, + { + "epoch": 0.3356683345246605, + "grad_norm": 0.49209004640579224, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0409, + "step": 11740 + }, + { + "epoch": 0.3359542530378842, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0468, + "step": 11750 + }, + { + "epoch": 0.3362401715511079, + "grad_norm": 0.7552497386932373, + "learning_rate": 1.454836451908656e-05, + "loss": 0.041, + "step": 11760 + }, + { + "epoch": 0.33652609006433165, + "grad_norm": 0.5737242102622986, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0503, + "step": 11770 + }, + { + "epoch": 0.3368120085775554, + "grad_norm": 0.46150341629981995, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0399, + "step": 11780 + }, + { + "epoch": 0.3370979270907791, + "grad_norm": 0.55389803647995, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0442, + "step": 11790 + }, + { + "epoch": 0.33738384560400286, + "grad_norm": 0.7647727727890015, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0472, + "step": 11800 + }, + { + "epoch": 0.3376697641172266, + "grad_norm": 0.8755397200584412, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0444, + "step": 11810 + }, + { + "epoch": 0.33795568263045034, + "grad_norm": 0.9257917404174805, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0416, + "step": 11820 + }, + { + "epoch": 0.3382416011436741, + "grad_norm": 0.4048840403556824, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0418, + "step": 11830 + }, + { + "epoch": 0.33852751965689776, + "grad_norm": 0.584200382232666, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0436, + "step": 11840 + }, + { + "epoch": 0.3388134381701215, + "grad_norm": 0.7565616369247437, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0407, + "step": 11850 + }, + { + "epoch": 0.33909935668334523, + "grad_norm": 0.8025793433189392, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0424, + "step": 11860 + }, + { + "epoch": 0.33938527519656897, + "grad_norm": 0.3123756945133209, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.044, + "step": 11870 + }, + { + "epoch": 0.3396711937097927, + "grad_norm": 0.8047941327095032, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0471, + "step": 11880 + }, + { + "epoch": 0.33995711222301644, + "grad_norm": 0.8675779104232788, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0443, + "step": 11890 + }, + { + "epoch": 0.3402430307362402, + "grad_norm": 0.47229406237602234, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0416, + "step": 11900 + }, + { + "epoch": 0.3405289492494639, + "grad_norm": 0.3775595426559448, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0512, + "step": 11910 + }, + { + "epoch": 0.34081486776268766, + "grad_norm": 0.6179372668266296, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0395, + "step": 11920 + }, + { + "epoch": 0.34110078627591134, + "grad_norm": 0.47618359327316284, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0407, + "step": 11930 + }, + { + "epoch": 0.3413867047891351, + "grad_norm": 0.5495609641075134, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.041, + "step": 11940 + }, + { + "epoch": 0.3416726233023588, + "grad_norm": 0.7276089191436768, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0445, + "step": 11950 + }, + { + "epoch": 0.34195854181558255, + "grad_norm": 0.9464111328125, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0471, + "step": 11960 + }, + { + "epoch": 0.3422444603288063, + "grad_norm": 0.8340250253677368, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0488, + "step": 11970 + }, + { + "epoch": 0.34253037884203, + "grad_norm": 0.6392719149589539, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0407, + "step": 11980 + }, + { + "epoch": 0.34281629735525376, + "grad_norm": 0.7563493251800537, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0388, + "step": 11990 + }, + { + "epoch": 0.3431022158684775, + "grad_norm": 0.7145271301269531, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.042, + "step": 12000 + }, + { + "epoch": 0.34338813438170124, + "grad_norm": 0.6522033214569092, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0507, + "step": 12010 + }, + { + "epoch": 0.3436740528949249, + "grad_norm": 0.4634755849838257, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0388, + "step": 12020 + }, + { + "epoch": 0.34395997140814866, + "grad_norm": 0.6681762337684631, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0498, + "step": 12030 + }, + { + "epoch": 0.3442458899213724, + "grad_norm": 0.5068351626396179, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0484, + "step": 12040 + }, + { + "epoch": 0.34453180843459613, + "grad_norm": 0.5424943566322327, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0406, + "step": 12050 + }, + { + "epoch": 0.34481772694781987, + "grad_norm": 0.674436628818512, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.04, + "step": 12060 + }, + { + "epoch": 0.3451036454610436, + "grad_norm": 0.8140727281570435, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0417, + "step": 12070 + }, + { + "epoch": 0.34538956397426734, + "grad_norm": 0.6394575238227844, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0413, + "step": 12080 + }, + { + "epoch": 0.3456754824874911, + "grad_norm": 0.5134334564208984, + "learning_rate": 1.425047976058418e-05, + "loss": 0.04, + "step": 12090 + }, + { + "epoch": 0.3459614010007148, + "grad_norm": 0.6670883297920227, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0397, + "step": 12100 + }, + { + "epoch": 0.3462473195139385, + "grad_norm": 0.49804338812828064, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0431, + "step": 12110 + }, + { + "epoch": 0.34653323802716224, + "grad_norm": 0.33912673592567444, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0492, + "step": 12120 + }, + { + "epoch": 0.346819156540386, + "grad_norm": 0.45478618144989014, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0427, + "step": 12130 + }, + { + "epoch": 0.3471050750536097, + "grad_norm": 0.6690845489501953, + "learning_rate": 1.420497389129506e-05, + "loss": 0.044, + "step": 12140 + }, + { + "epoch": 0.34739099356683345, + "grad_norm": 0.9296556115150452, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.042, + "step": 12150 + }, + { + "epoch": 0.3476769120800572, + "grad_norm": 0.4859760105609894, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0386, + "step": 12160 + }, + { + "epoch": 0.3479628305932809, + "grad_norm": 1.0067541599273682, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0495, + "step": 12170 + }, + { + "epoch": 0.34824874910650466, + "grad_norm": 0.7799471616744995, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0614, + "step": 12180 + }, + { + "epoch": 0.3485346676197284, + "grad_norm": 0.48603832721710205, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0422, + "step": 12190 + }, + { + "epoch": 0.3488205861329521, + "grad_norm": 1.2030225992202759, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0535, + "step": 12200 + }, + { + "epoch": 0.3491065046461758, + "grad_norm": 0.5523782968521118, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0437, + "step": 12210 + }, + { + "epoch": 0.34939242315939956, + "grad_norm": 0.9041968584060669, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0441, + "step": 12220 + }, + { + "epoch": 0.3496783416726233, + "grad_norm": 0.5859020948410034, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0451, + "step": 12230 + }, + { + "epoch": 0.34996426018584703, + "grad_norm": 0.8736525177955627, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0439, + "step": 12240 + }, + { + "epoch": 0.35025017869907077, + "grad_norm": 0.4692678153514862, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0516, + "step": 12250 + }, + { + "epoch": 0.3505360972122945, + "grad_norm": 0.6326560974121094, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0427, + "step": 12260 + }, + { + "epoch": 0.35082201572551824, + "grad_norm": 0.6265914440155029, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0392, + "step": 12270 + }, + { + "epoch": 0.351107934238742, + "grad_norm": 0.8684681057929993, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0416, + "step": 12280 + }, + { + "epoch": 0.35139385275196566, + "grad_norm": 0.6076116561889648, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0405, + "step": 12290 + }, + { + "epoch": 0.3516797712651894, + "grad_norm": 0.36192813515663147, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0417, + "step": 12300 + }, + { + "epoch": 0.35196568977841314, + "grad_norm": 0.5561486482620239, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0397, + "step": 12310 + }, + { + "epoch": 0.3522516082916369, + "grad_norm": 0.5955346822738647, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0332, + "step": 12320 + }, + { + "epoch": 0.3525375268048606, + "grad_norm": 0.4861294627189636, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0423, + "step": 12330 + }, + { + "epoch": 0.35282344531808435, + "grad_norm": 0.920704185962677, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0467, + "step": 12340 + }, + { + "epoch": 0.3531093638313081, + "grad_norm": 0.4749159514904022, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0425, + "step": 12350 + }, + { + "epoch": 0.3533952823445318, + "grad_norm": 0.5075432658195496, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0362, + "step": 12360 + }, + { + "epoch": 0.35368120085775556, + "grad_norm": 0.3057022988796234, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0378, + "step": 12370 + }, + { + "epoch": 0.35396711937097924, + "grad_norm": 0.48122167587280273, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0359, + "step": 12380 + }, + { + "epoch": 0.354253037884203, + "grad_norm": 0.39227673411369324, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0432, + "step": 12390 + }, + { + "epoch": 0.3545389563974267, + "grad_norm": 0.641839861869812, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0422, + "step": 12400 + }, + { + "epoch": 0.35482487491065046, + "grad_norm": 1.0422887802124023, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0445, + "step": 12410 + }, + { + "epoch": 0.3551107934238742, + "grad_norm": 0.5336428880691528, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0408, + "step": 12420 + }, + { + "epoch": 0.35539671193709793, + "grad_norm": 0.6634368896484375, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0374, + "step": 12430 + }, + { + "epoch": 0.35568263045032167, + "grad_norm": 0.5840758085250854, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0417, + "step": 12440 + }, + { + "epoch": 0.3559685489635454, + "grad_norm": 0.8465530872344971, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0449, + "step": 12450 + }, + { + "epoch": 0.35625446747676914, + "grad_norm": 0.48737838864326477, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0439, + "step": 12460 + }, + { + "epoch": 0.3565403859899928, + "grad_norm": 1.2267687320709229, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0395, + "step": 12470 + }, + { + "epoch": 0.35682630450321656, + "grad_norm": 0.4097842276096344, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0379, + "step": 12480 + }, + { + "epoch": 0.3571122230164403, + "grad_norm": 0.8895343542098999, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0415, + "step": 12490 + }, + { + "epoch": 0.35739814152966404, + "grad_norm": 0.6732933521270752, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0432, + "step": 12500 + }, + { + "epoch": 0.3576840600428878, + "grad_norm": 0.4521937966346741, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0442, + "step": 12510 + }, + { + "epoch": 0.3579699785561115, + "grad_norm": 0.5932701826095581, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0407, + "step": 12520 + }, + { + "epoch": 0.35825589706933525, + "grad_norm": 0.5595138669013977, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0387, + "step": 12530 + }, + { + "epoch": 0.358541815582559, + "grad_norm": 0.7205538153648376, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0393, + "step": 12540 + }, + { + "epoch": 0.3588277340957827, + "grad_norm": 0.4069580137729645, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0554, + "step": 12550 + }, + { + "epoch": 0.3591136526090064, + "grad_norm": 0.4881740212440491, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0411, + "step": 12560 + }, + { + "epoch": 0.35939957112223014, + "grad_norm": 0.7710328102111816, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.043, + "step": 12570 + }, + { + "epoch": 0.3596854896354539, + "grad_norm": 0.6593908071517944, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.046, + "step": 12580 + }, + { + "epoch": 0.3599714081486776, + "grad_norm": 0.6712149977684021, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0392, + "step": 12590 + }, + { + "epoch": 0.36025732666190136, + "grad_norm": 0.6103658080101013, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0482, + "step": 12600 + }, + { + "epoch": 0.3605432451751251, + "grad_norm": 0.5170528292655945, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0441, + "step": 12610 + }, + { + "epoch": 0.36082916368834883, + "grad_norm": 0.47434374690055847, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0436, + "step": 12620 + }, + { + "epoch": 0.36111508220157257, + "grad_norm": 0.6546452045440674, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0441, + "step": 12630 + }, + { + "epoch": 0.3614010007147963, + "grad_norm": 1.3334686756134033, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0464, + "step": 12640 + }, + { + "epoch": 0.36168691922802, + "grad_norm": 1.3882309198379517, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0527, + "step": 12650 + }, + { + "epoch": 0.3619728377412437, + "grad_norm": 0.829872190952301, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0499, + "step": 12660 + }, + { + "epoch": 0.36225875625446746, + "grad_norm": 0.6917227506637573, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0513, + "step": 12670 + }, + { + "epoch": 0.3625446747676912, + "grad_norm": 0.3825722634792328, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0455, + "step": 12680 + }, + { + "epoch": 0.36283059328091494, + "grad_norm": 0.7726976275444031, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0366, + "step": 12690 + }, + { + "epoch": 0.3631165117941387, + "grad_norm": 0.48851099610328674, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0363, + "step": 12700 + }, + { + "epoch": 0.3634024303073624, + "grad_norm": 0.5034362077713013, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0461, + "step": 12710 + }, + { + "epoch": 0.36368834882058615, + "grad_norm": 0.8411096334457397, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0448, + "step": 12720 + }, + { + "epoch": 0.3639742673338099, + "grad_norm": 0.7185337543487549, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0366, + "step": 12730 + }, + { + "epoch": 0.36426018584703357, + "grad_norm": 0.5850857496261597, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0414, + "step": 12740 + }, + { + "epoch": 0.3645461043602573, + "grad_norm": 0.47304606437683105, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0464, + "step": 12750 + }, + { + "epoch": 0.36483202287348104, + "grad_norm": 0.7190109491348267, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0418, + "step": 12760 + }, + { + "epoch": 0.3651179413867048, + "grad_norm": 0.8053406476974487, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0407, + "step": 12770 + }, + { + "epoch": 0.3654038598999285, + "grad_norm": 0.8875076174736023, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0471, + "step": 12780 + }, + { + "epoch": 0.36568977841315226, + "grad_norm": 0.5206999182701111, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0478, + "step": 12790 + }, + { + "epoch": 0.365975696926376, + "grad_norm": 0.5034269690513611, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0422, + "step": 12800 + }, + { + "epoch": 0.36626161543959973, + "grad_norm": 0.9846853017807007, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.045, + "step": 12810 + }, + { + "epoch": 0.36654753395282347, + "grad_norm": 0.49341151118278503, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0471, + "step": 12820 + }, + { + "epoch": 0.36683345246604715, + "grad_norm": 0.765583336353302, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0411, + "step": 12830 + }, + { + "epoch": 0.3671193709792709, + "grad_norm": 0.5193378925323486, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0522, + "step": 12840 + }, + { + "epoch": 0.3674052894924946, + "grad_norm": 0.8142374157905579, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0374, + "step": 12850 + }, + { + "epoch": 0.36769120800571836, + "grad_norm": 0.7233540415763855, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0516, + "step": 12860 + }, + { + "epoch": 0.3679771265189421, + "grad_norm": 0.38758793473243713, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0437, + "step": 12870 + }, + { + "epoch": 0.36826304503216584, + "grad_norm": 0.36923956871032715, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.041, + "step": 12880 + }, + { + "epoch": 0.3685489635453896, + "grad_norm": 1.0518147945404053, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0446, + "step": 12890 + }, + { + "epoch": 0.3688348820586133, + "grad_norm": 0.5833591818809509, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0362, + "step": 12900 + }, + { + "epoch": 0.36912080057183705, + "grad_norm": 0.6178849339485168, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.041, + "step": 12910 + }, + { + "epoch": 0.36940671908506073, + "grad_norm": 0.7599044442176819, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0473, + "step": 12920 + }, + { + "epoch": 0.36969263759828447, + "grad_norm": 0.7787651419639587, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0461, + "step": 12930 + }, + { + "epoch": 0.3699785561115082, + "grad_norm": 0.3847586512565613, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0413, + "step": 12940 + }, + { + "epoch": 0.37026447462473194, + "grad_norm": 0.6218805313110352, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0424, + "step": 12950 + }, + { + "epoch": 0.3705503931379557, + "grad_norm": 0.6770363450050354, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0426, + "step": 12960 + }, + { + "epoch": 0.3708363116511794, + "grad_norm": 0.6817107796669006, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.041, + "step": 12970 + }, + { + "epoch": 0.37112223016440316, + "grad_norm": 1.6997944116592407, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0626, + "step": 12980 + }, + { + "epoch": 0.3714081486776269, + "grad_norm": 0.4540708363056183, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0356, + "step": 12990 + }, + { + "epoch": 0.37169406719085063, + "grad_norm": 0.4272336959838867, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0354, + "step": 13000 + }, + { + "epoch": 0.3719799857040743, + "grad_norm": 0.4723891019821167, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0438, + "step": 13010 + }, + { + "epoch": 0.37226590421729805, + "grad_norm": 0.5508099794387817, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.042, + "step": 13020 + }, + { + "epoch": 0.3725518227305218, + "grad_norm": 1.05836021900177, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0472, + "step": 13030 + }, + { + "epoch": 0.3728377412437455, + "grad_norm": 0.4397801458835602, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0462, + "step": 13040 + }, + { + "epoch": 0.37312365975696926, + "grad_norm": 0.3131158649921417, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0383, + "step": 13050 + }, + { + "epoch": 0.373409578270193, + "grad_norm": 0.5489990711212158, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0398, + "step": 13060 + }, + { + "epoch": 0.37369549678341674, + "grad_norm": 0.7425751686096191, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0416, + "step": 13070 + }, + { + "epoch": 0.3739814152966405, + "grad_norm": 0.6337125301361084, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0387, + "step": 13080 + }, + { + "epoch": 0.3742673338098642, + "grad_norm": 0.656467854976654, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0431, + "step": 13090 + }, + { + "epoch": 0.3745532523230879, + "grad_norm": 0.7011964321136475, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0487, + "step": 13100 + }, + { + "epoch": 0.37483917083631163, + "grad_norm": 0.4949609041213989, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0429, + "step": 13110 + }, + { + "epoch": 0.37512508934953537, + "grad_norm": 0.6796516180038452, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0405, + "step": 13120 + }, + { + "epoch": 0.3754110078627591, + "grad_norm": 0.41161492466926575, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0359, + "step": 13130 + }, + { + "epoch": 0.37569692637598284, + "grad_norm": 0.4463254511356354, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0353, + "step": 13140 + }, + { + "epoch": 0.3759828448892066, + "grad_norm": 0.4082377254962921, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.047, + "step": 13150 + }, + { + "epoch": 0.3762687634024303, + "grad_norm": 0.7927104830741882, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0484, + "step": 13160 + }, + { + "epoch": 0.37655468191565405, + "grad_norm": 0.5212385058403015, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.041, + "step": 13170 + }, + { + "epoch": 0.3768406004288778, + "grad_norm": 0.7408128380775452, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0462, + "step": 13180 + }, + { + "epoch": 0.3771265189421015, + "grad_norm": 0.3847906291484833, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0361, + "step": 13190 + }, + { + "epoch": 0.3774124374553252, + "grad_norm": 0.5039756298065186, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0385, + "step": 13200 + }, + { + "epoch": 0.37769835596854895, + "grad_norm": 0.5682945251464844, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0369, + "step": 13210 + }, + { + "epoch": 0.3779842744817727, + "grad_norm": 0.5985261797904968, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0376, + "step": 13220 + }, + { + "epoch": 0.3782701929949964, + "grad_norm": 0.7080312967300415, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0514, + "step": 13230 + }, + { + "epoch": 0.37855611150822016, + "grad_norm": 0.7488406300544739, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0421, + "step": 13240 + }, + { + "epoch": 0.3788420300214439, + "grad_norm": 0.38066044449806213, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0411, + "step": 13250 + }, + { + "epoch": 0.37912794853466764, + "grad_norm": 0.6335283517837524, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0526, + "step": 13260 + }, + { + "epoch": 0.3794138670478914, + "grad_norm": 0.7008160352706909, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0402, + "step": 13270 + }, + { + "epoch": 0.37969978556111506, + "grad_norm": 0.4219777286052704, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.039, + "step": 13280 + }, + { + "epoch": 0.3799857040743388, + "grad_norm": 0.6447705030441284, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0412, + "step": 13290 + }, + { + "epoch": 0.38027162258756253, + "grad_norm": 0.4625374674797058, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0437, + "step": 13300 + }, + { + "epoch": 0.38055754110078627, + "grad_norm": 0.4056257903575897, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0377, + "step": 13310 + }, + { + "epoch": 0.38084345961401, + "grad_norm": 0.425281286239624, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0378, + "step": 13320 + }, + { + "epoch": 0.38112937812723374, + "grad_norm": 0.4031837582588196, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0361, + "step": 13330 + }, + { + "epoch": 0.3814152966404575, + "grad_norm": 0.469175785779953, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0391, + "step": 13340 + }, + { + "epoch": 0.3817012151536812, + "grad_norm": 0.36555227637290955, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0352, + "step": 13350 + }, + { + "epoch": 0.38198713366690495, + "grad_norm": 0.8802763819694519, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0412, + "step": 13360 + }, + { + "epoch": 0.38227305218012864, + "grad_norm": 0.5733079314231873, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0418, + "step": 13370 + }, + { + "epoch": 0.3825589706933524, + "grad_norm": 0.606238842010498, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0518, + "step": 13380 + }, + { + "epoch": 0.3828448892065761, + "grad_norm": 0.5096673369407654, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0404, + "step": 13390 + }, + { + "epoch": 0.38313080771979985, + "grad_norm": 0.8240867853164673, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0513, + "step": 13400 + }, + { + "epoch": 0.3834167262330236, + "grad_norm": 0.3757685422897339, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0407, + "step": 13410 + }, + { + "epoch": 0.3837026447462473, + "grad_norm": 0.4560941755771637, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0429, + "step": 13420 + }, + { + "epoch": 0.38398856325947106, + "grad_norm": 0.42831951379776, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0387, + "step": 13430 + }, + { + "epoch": 0.3842744817726948, + "grad_norm": 0.8373785614967346, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0473, + "step": 13440 + }, + { + "epoch": 0.38456040028591854, + "grad_norm": 0.9560670256614685, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0442, + "step": 13450 + }, + { + "epoch": 0.3848463187991422, + "grad_norm": 0.4101570248603821, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0429, + "step": 13460 + }, + { + "epoch": 0.38513223731236595, + "grad_norm": 0.673739492893219, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0525, + "step": 13470 + }, + { + "epoch": 0.3854181558255897, + "grad_norm": 1.126909852027893, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0499, + "step": 13480 + }, + { + "epoch": 0.38570407433881343, + "grad_norm": 0.571437656879425, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0431, + "step": 13490 + }, + { + "epoch": 0.38598999285203717, + "grad_norm": 0.5121229887008667, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0419, + "step": 13500 + }, + { + "epoch": 0.3862759113652609, + "grad_norm": 0.6143786907196045, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0373, + "step": 13510 + }, + { + "epoch": 0.38656182987848464, + "grad_norm": 0.395014226436615, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0457, + "step": 13520 + }, + { + "epoch": 0.3868477483917084, + "grad_norm": 0.46027693152427673, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0372, + "step": 13530 + }, + { + "epoch": 0.3871336669049321, + "grad_norm": 0.42744559049606323, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0417, + "step": 13540 + }, + { + "epoch": 0.3874195854181558, + "grad_norm": 0.4765837490558624, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0442, + "step": 13550 + }, + { + "epoch": 0.38770550393137954, + "grad_norm": 0.9767054319381714, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0397, + "step": 13560 + }, + { + "epoch": 0.3879914224446033, + "grad_norm": 0.5535935759544373, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0423, + "step": 13570 + }, + { + "epoch": 0.388277340957827, + "grad_norm": 0.3802829384803772, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0329, + "step": 13580 + }, + { + "epoch": 0.38856325947105075, + "grad_norm": 0.6564178466796875, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0423, + "step": 13590 + }, + { + "epoch": 0.3888491779842745, + "grad_norm": 0.4400223195552826, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0356, + "step": 13600 + }, + { + "epoch": 0.3891350964974982, + "grad_norm": 0.4441612958908081, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0576, + "step": 13610 + }, + { + "epoch": 0.38942101501072196, + "grad_norm": 0.5270922780036926, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0406, + "step": 13620 + }, + { + "epoch": 0.3897069335239457, + "grad_norm": 0.6497722268104553, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0389, + "step": 13630 + }, + { + "epoch": 0.3899928520371694, + "grad_norm": 0.628182053565979, + "learning_rate": 1.280216624157504e-05, + "loss": 0.049, + "step": 13640 + }, + { + "epoch": 0.3902787705503931, + "grad_norm": 0.5242640376091003, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0389, + "step": 13650 + }, + { + "epoch": 0.39056468906361685, + "grad_norm": 0.5140895843505859, + "learning_rate": 1.278305741539386e-05, + "loss": 0.047, + "step": 13660 + }, + { + "epoch": 0.3908506075768406, + "grad_norm": 0.531012773513794, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0415, + "step": 13670 + }, + { + "epoch": 0.39113652609006433, + "grad_norm": 0.5066007375717163, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0411, + "step": 13680 + }, + { + "epoch": 0.39142244460328807, + "grad_norm": 1.0783177614212036, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0371, + "step": 13690 + }, + { + "epoch": 0.3917083631165118, + "grad_norm": 0.592755913734436, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0402, + "step": 13700 + }, + { + "epoch": 0.39199428162973554, + "grad_norm": 0.5595790147781372, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0543, + "step": 13710 + }, + { + "epoch": 0.3922802001429593, + "grad_norm": 0.5388237237930298, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0487, + "step": 13720 + }, + { + "epoch": 0.39256611865618296, + "grad_norm": 0.5311065316200256, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0389, + "step": 13730 + }, + { + "epoch": 0.3928520371694067, + "grad_norm": 0.8037494421005249, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0456, + "step": 13740 + }, + { + "epoch": 0.39313795568263044, + "grad_norm": 0.851921796798706, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0389, + "step": 13750 + }, + { + "epoch": 0.3934238741958542, + "grad_norm": 0.5924596190452576, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0401, + "step": 13760 + }, + { + "epoch": 0.3937097927090779, + "grad_norm": 0.5660725831985474, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0443, + "step": 13770 + }, + { + "epoch": 0.39399571122230165, + "grad_norm": 0.4110502004623413, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0438, + "step": 13780 + }, + { + "epoch": 0.3942816297355254, + "grad_norm": 0.7104408144950867, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.042, + "step": 13790 + }, + { + "epoch": 0.3945675482487491, + "grad_norm": 0.5490137338638306, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0477, + "step": 13800 + }, + { + "epoch": 0.39485346676197286, + "grad_norm": 0.4189203083515167, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0446, + "step": 13810 + }, + { + "epoch": 0.39513938527519654, + "grad_norm": 3.620929479598999, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0541, + "step": 13820 + }, + { + "epoch": 0.3954253037884203, + "grad_norm": 0.4670915901660919, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0391, + "step": 13830 + }, + { + "epoch": 0.395711222301644, + "grad_norm": 0.4475649297237396, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.04, + "step": 13840 + }, + { + "epoch": 0.39599714081486775, + "grad_norm": 0.4646693170070648, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0412, + "step": 13850 + }, + { + "epoch": 0.3962830593280915, + "grad_norm": 0.4141371250152588, + "learning_rate": 1.259152361972498e-05, + "loss": 0.039, + "step": 13860 + }, + { + "epoch": 0.39656897784131523, + "grad_norm": 0.7549411058425903, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0414, + "step": 13870 + }, + { + "epoch": 0.39685489635453897, + "grad_norm": 0.5687856078147888, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0441, + "step": 13880 + }, + { + "epoch": 0.3971408148677627, + "grad_norm": 0.582946240901947, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0451, + "step": 13890 + }, + { + "epoch": 0.39742673338098644, + "grad_norm": 0.6410595178604126, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0362, + "step": 13900 + }, + { + "epoch": 0.3977126518942101, + "grad_norm": 0.4375670850276947, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0552, + "step": 13910 + }, + { + "epoch": 0.39799857040743386, + "grad_norm": 0.5675646662712097, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0373, + "step": 13920 + }, + { + "epoch": 0.3982844889206576, + "grad_norm": 0.544170618057251, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0449, + "step": 13930 + }, + { + "epoch": 0.39857040743388134, + "grad_norm": 0.44928276538848877, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0461, + "step": 13940 + }, + { + "epoch": 0.3988563259471051, + "grad_norm": 0.511382520198822, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0413, + "step": 13950 + }, + { + "epoch": 0.3991422444603288, + "grad_norm": 0.38443753123283386, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0374, + "step": 13960 + }, + { + "epoch": 0.39942816297355255, + "grad_norm": 0.5726080536842346, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0553, + "step": 13970 + }, + { + "epoch": 0.3997140814867763, + "grad_norm": 0.554694414138794, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0404, + "step": 13980 + }, + { + "epoch": 0.4, + "grad_norm": 0.4891316592693329, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0418, + "step": 13990 + }, + { + "epoch": 0.4002859185132237, + "grad_norm": 0.5150312781333923, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0418, + "step": 14000 + }, + { + "epoch": 0.40057183702644744, + "grad_norm": 0.9077253937721252, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0415, + "step": 14010 + }, + { + "epoch": 0.4008577555396712, + "grad_norm": 0.9126781225204468, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.048, + "step": 14020 + }, + { + "epoch": 0.4011436740528949, + "grad_norm": 0.6264623999595642, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0411, + "step": 14030 + }, + { + "epoch": 0.40142959256611865, + "grad_norm": 0.523853600025177, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.051, + "step": 14040 + }, + { + "epoch": 0.4017155110793424, + "grad_norm": 0.6340035200119019, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0426, + "step": 14050 + }, + { + "epoch": 0.40200142959256613, + "grad_norm": 0.3594725430011749, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0397, + "step": 14060 + }, + { + "epoch": 0.40228734810578987, + "grad_norm": 0.941470742225647, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0402, + "step": 14070 + }, + { + "epoch": 0.4025732666190136, + "grad_norm": 0.840506911277771, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0473, + "step": 14080 + }, + { + "epoch": 0.4028591851322373, + "grad_norm": 0.3359200954437256, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0405, + "step": 14090 + }, + { + "epoch": 0.403145103645461, + "grad_norm": 0.49658629298210144, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0464, + "step": 14100 + }, + { + "epoch": 0.40343102215868476, + "grad_norm": 0.7940187454223633, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0417, + "step": 14110 + }, + { + "epoch": 0.4037169406719085, + "grad_norm": 0.30110660195350647, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0371, + "step": 14120 + }, + { + "epoch": 0.40400285918513223, + "grad_norm": 0.42845240235328674, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.053, + "step": 14130 + }, + { + "epoch": 0.40428877769835597, + "grad_norm": 0.997348427772522, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.041, + "step": 14140 + }, + { + "epoch": 0.4045746962115797, + "grad_norm": 0.4759966731071472, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0377, + "step": 14150 + }, + { + "epoch": 0.40486061472480345, + "grad_norm": 0.42045602202415466, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0397, + "step": 14160 + }, + { + "epoch": 0.4051465332380272, + "grad_norm": 0.6400002837181091, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0507, + "step": 14170 + }, + { + "epoch": 0.40543245175125087, + "grad_norm": 0.5473673939704895, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0359, + "step": 14180 + }, + { + "epoch": 0.4057183702644746, + "grad_norm": 0.7414730787277222, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0416, + "step": 14190 + }, + { + "epoch": 0.40600428877769834, + "grad_norm": 0.4691861867904663, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0363, + "step": 14200 + }, + { + "epoch": 0.4062902072909221, + "grad_norm": 0.9186112880706787, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0445, + "step": 14210 + }, + { + "epoch": 0.4065761258041458, + "grad_norm": 0.6782190203666687, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.40686204431736955, + "grad_norm": 0.6948013305664062, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.037, + "step": 14230 + }, + { + "epoch": 0.4071479628305933, + "grad_norm": 0.3034680485725403, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0371, + "step": 14240 + }, + { + "epoch": 0.40743388134381703, + "grad_norm": 0.4254174828529358, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0449, + "step": 14250 + }, + { + "epoch": 0.40771979985704077, + "grad_norm": 1.3622064590454102, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0428, + "step": 14260 + }, + { + "epoch": 0.40800571837026445, + "grad_norm": 0.5928359031677246, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0424, + "step": 14270 + }, + { + "epoch": 0.4082916368834882, + "grad_norm": 0.9103132486343384, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0414, + "step": 14280 + }, + { + "epoch": 0.4085775553967119, + "grad_norm": 0.6338028311729431, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0376, + "step": 14290 + }, + { + "epoch": 0.40886347390993566, + "grad_norm": 0.9920284748077393, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0393, + "step": 14300 + }, + { + "epoch": 0.4091493924231594, + "grad_norm": 0.411830335855484, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0336, + "step": 14310 + }, + { + "epoch": 0.40943531093638313, + "grad_norm": 0.6977682709693909, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0454, + "step": 14320 + }, + { + "epoch": 0.40972122944960687, + "grad_norm": 0.6303663849830627, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0453, + "step": 14330 + }, + { + "epoch": 0.4100071479628306, + "grad_norm": 0.3048207759857178, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0373, + "step": 14340 + }, + { + "epoch": 0.41029306647605435, + "grad_norm": 0.7683395743370056, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0438, + "step": 14350 + }, + { + "epoch": 0.41057898498927803, + "grad_norm": 0.5791511535644531, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0392, + "step": 14360 + }, + { + "epoch": 0.41086490350250177, + "grad_norm": 0.876626193523407, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0324, + "step": 14370 + }, + { + "epoch": 0.4111508220157255, + "grad_norm": 0.5971815586090088, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0368, + "step": 14380 + }, + { + "epoch": 0.41143674052894924, + "grad_norm": 0.6508862376213074, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0411, + "step": 14390 + }, + { + "epoch": 0.411722659042173, + "grad_norm": 0.4704359471797943, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0351, + "step": 14400 + }, + { + "epoch": 0.4120085775553967, + "grad_norm": 0.4266453683376312, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0367, + "step": 14410 + }, + { + "epoch": 0.41229449606862045, + "grad_norm": 0.5898434519767761, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0376, + "step": 14420 + }, + { + "epoch": 0.4125804145818442, + "grad_norm": 0.8741532564163208, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0419, + "step": 14430 + }, + { + "epoch": 0.41286633309506793, + "grad_norm": 0.24328190088272095, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0333, + "step": 14440 + }, + { + "epoch": 0.4131522516082916, + "grad_norm": 0.4263601303100586, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.039, + "step": 14450 + }, + { + "epoch": 0.41343817012151535, + "grad_norm": 0.6311615109443665, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0454, + "step": 14460 + }, + { + "epoch": 0.4137240886347391, + "grad_norm": 0.7424519658088684, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0392, + "step": 14470 + }, + { + "epoch": 0.4140100071479628, + "grad_norm": 0.48323145508766174, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0374, + "step": 14480 + }, + { + "epoch": 0.41429592566118656, + "grad_norm": 0.38597407937049866, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0393, + "step": 14490 + }, + { + "epoch": 0.4145818441744103, + "grad_norm": 0.7251518964767456, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0431, + "step": 14500 + }, + { + "epoch": 0.41486776268763403, + "grad_norm": 0.44361060857772827, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0426, + "step": 14510 + }, + { + "epoch": 0.41515368120085777, + "grad_norm": 0.5625014305114746, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0372, + "step": 14520 + }, + { + "epoch": 0.4154395997140815, + "grad_norm": 0.27855798602104187, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0356, + "step": 14530 + }, + { + "epoch": 0.4157255182273052, + "grad_norm": 0.5966296195983887, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0387, + "step": 14540 + }, + { + "epoch": 0.41601143674052893, + "grad_norm": 0.49445512890815735, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0355, + "step": 14550 + }, + { + "epoch": 0.41629735525375267, + "grad_norm": 0.3813278377056122, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0456, + "step": 14560 + }, + { + "epoch": 0.4165832737669764, + "grad_norm": 0.5962988138198853, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0401, + "step": 14570 + }, + { + "epoch": 0.41686919228020014, + "grad_norm": 0.4028547406196594, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0371, + "step": 14580 + }, + { + "epoch": 0.4171551107934239, + "grad_norm": 1.348706841468811, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0426, + "step": 14590 + }, + { + "epoch": 0.4174410293066476, + "grad_norm": 1.2782070636749268, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0393, + "step": 14600 + }, + { + "epoch": 0.41772694781987135, + "grad_norm": 1.0024999380111694, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0436, + "step": 14610 + }, + { + "epoch": 0.4180128663330951, + "grad_norm": 0.35450127720832825, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0411, + "step": 14620 + }, + { + "epoch": 0.41829878484631877, + "grad_norm": 0.5827250480651855, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0372, + "step": 14630 + }, + { + "epoch": 0.4185847033595425, + "grad_norm": 0.5905774235725403, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0394, + "step": 14640 + }, + { + "epoch": 0.41887062187276625, + "grad_norm": 0.652074933052063, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0405, + "step": 14650 + }, + { + "epoch": 0.41915654038599, + "grad_norm": 0.7245490550994873, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0473, + "step": 14660 + }, + { + "epoch": 0.4194424588992137, + "grad_norm": 0.5153012871742249, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.043, + "step": 14670 + }, + { + "epoch": 0.41972837741243746, + "grad_norm": 0.516107976436615, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0434, + "step": 14680 + }, + { + "epoch": 0.4200142959256612, + "grad_norm": 0.4743354618549347, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0429, + "step": 14690 + }, + { + "epoch": 0.42030021443888493, + "grad_norm": 0.547875165939331, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0395, + "step": 14700 + }, + { + "epoch": 0.42058613295210867, + "grad_norm": 0.6398400068283081, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0384, + "step": 14710 + }, + { + "epoch": 0.42087205146533235, + "grad_norm": 0.5891467332839966, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0399, + "step": 14720 + }, + { + "epoch": 0.4211579699785561, + "grad_norm": 0.3927595615386963, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0353, + "step": 14730 + }, + { + "epoch": 0.42144388849177983, + "grad_norm": 0.6477030515670776, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0492, + "step": 14740 + }, + { + "epoch": 0.42172980700500357, + "grad_norm": 0.7090615034103394, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.042, + "step": 14750 + }, + { + "epoch": 0.4220157255182273, + "grad_norm": 0.6572134494781494, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0406, + "step": 14760 + }, + { + "epoch": 0.42230164403145104, + "grad_norm": 0.787663996219635, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0424, + "step": 14770 + }, + { + "epoch": 0.4225875625446748, + "grad_norm": 0.8419309258460999, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0427, + "step": 14780 + }, + { + "epoch": 0.4228734810578985, + "grad_norm": 0.6204128861427307, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0364, + "step": 14790 + }, + { + "epoch": 0.42315939957112225, + "grad_norm": 0.7446070313453674, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0391, + "step": 14800 + }, + { + "epoch": 0.42344531808434593, + "grad_norm": 0.7446451783180237, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0384, + "step": 14810 + }, + { + "epoch": 0.42373123659756967, + "grad_norm": 0.6946475505828857, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0375, + "step": 14820 + }, + { + "epoch": 0.4240171551107934, + "grad_norm": 0.6997008323669434, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0393, + "step": 14830 + }, + { + "epoch": 0.42430307362401715, + "grad_norm": 0.4857316315174103, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0474, + "step": 14840 + }, + { + "epoch": 0.4245889921372409, + "grad_norm": 1.3516888618469238, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.047, + "step": 14850 + }, + { + "epoch": 0.4248749106504646, + "grad_norm": 0.40320220589637756, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0418, + "step": 14860 + }, + { + "epoch": 0.42516082916368836, + "grad_norm": 0.9002796411514282, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0434, + "step": 14870 + }, + { + "epoch": 0.4254467476769121, + "grad_norm": 0.3810071349143982, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0338, + "step": 14880 + }, + { + "epoch": 0.42573266619013583, + "grad_norm": 0.5786157250404358, + "learning_rate": 1.159527607963768e-05, + "loss": 0.037, + "step": 14890 + }, + { + "epoch": 0.4260185847033595, + "grad_norm": 0.6316869258880615, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0388, + "step": 14900 + }, + { + "epoch": 0.42630450321658325, + "grad_norm": 0.608745276927948, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0426, + "step": 14910 + }, + { + "epoch": 0.426590421729807, + "grad_norm": 0.6655036807060242, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0433, + "step": 14920 + }, + { + "epoch": 0.4268763402430307, + "grad_norm": 0.29059523344039917, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0507, + "step": 14930 + }, + { + "epoch": 0.42716225875625446, + "grad_norm": 0.9066076278686523, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0447, + "step": 14940 + }, + { + "epoch": 0.4274481772694782, + "grad_norm": 1.0660220384597778, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0512, + "step": 14950 + }, + { + "epoch": 0.42773409578270194, + "grad_norm": 0.6081144213676453, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0426, + "step": 14960 + }, + { + "epoch": 0.4280200142959257, + "grad_norm": 0.46524369716644287, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0435, + "step": 14970 + }, + { + "epoch": 0.4283059328091494, + "grad_norm": 0.3497388958930969, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0492, + "step": 14980 + }, + { + "epoch": 0.4285918513223731, + "grad_norm": 0.41300803422927856, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.034, + "step": 14990 + }, + { + "epoch": 0.42887776983559683, + "grad_norm": 0.4363289177417755, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0358, + "step": 15000 + }, + { + "epoch": 0.42916368834882057, + "grad_norm": 1.314915418624878, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.047, + "step": 15010 + }, + { + "epoch": 0.4294496068620443, + "grad_norm": 0.558199942111969, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0313, + "step": 15020 + }, + { + "epoch": 0.42973552537526805, + "grad_norm": 0.3857463598251343, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0416, + "step": 15030 + }, + { + "epoch": 0.4300214438884918, + "grad_norm": 0.4701749384403229, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0425, + "step": 15040 + }, + { + "epoch": 0.4303073624017155, + "grad_norm": 0.4611213803291321, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0457, + "step": 15050 + }, + { + "epoch": 0.43059328091493926, + "grad_norm": 0.5338016152381897, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.038, + "step": 15060 + }, + { + "epoch": 0.430879199428163, + "grad_norm": 0.9078943133354187, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0395, + "step": 15070 + }, + { + "epoch": 0.4311651179413867, + "grad_norm": 0.5354048013687134, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0403, + "step": 15080 + }, + { + "epoch": 0.4314510364546104, + "grad_norm": 0.35511279106140137, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0377, + "step": 15090 + }, + { + "epoch": 0.43173695496783415, + "grad_norm": 0.37104350328445435, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0426, + "step": 15100 + }, + { + "epoch": 0.4320228734810579, + "grad_norm": 0.8916210532188416, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0387, + "step": 15110 + }, + { + "epoch": 0.4323087919942816, + "grad_norm": 0.514994740486145, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0384, + "step": 15120 + }, + { + "epoch": 0.43259471050750536, + "grad_norm": 0.8440690040588379, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0437, + "step": 15130 + }, + { + "epoch": 0.4328806290207291, + "grad_norm": 0.6815949082374573, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0453, + "step": 15140 + }, + { + "epoch": 0.43316654753395284, + "grad_norm": 0.33178189396858215, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0351, + "step": 15150 + }, + { + "epoch": 0.4334524660471766, + "grad_norm": 0.5686727166175842, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0368, + "step": 15160 + }, + { + "epoch": 0.43373838456040026, + "grad_norm": 0.44143930077552795, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0443, + "step": 15170 + }, + { + "epoch": 0.434024303073624, + "grad_norm": 0.3238232135772705, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0348, + "step": 15180 + }, + { + "epoch": 0.43431022158684773, + "grad_norm": 0.5038242340087891, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0343, + "step": 15190 + }, + { + "epoch": 0.43459614010007147, + "grad_norm": 0.4904351234436035, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0397, + "step": 15200 + }, + { + "epoch": 0.4348820586132952, + "grad_norm": 0.5325750708580017, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0499, + "step": 15210 + }, + { + "epoch": 0.43516797712651895, + "grad_norm": 0.39443954825401306, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.044, + "step": 15220 + }, + { + "epoch": 0.4354538956397427, + "grad_norm": 0.6782003045082092, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0358, + "step": 15230 + }, + { + "epoch": 0.4357398141529664, + "grad_norm": 0.47862571477890015, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0418, + "step": 15240 + }, + { + "epoch": 0.43602573266619016, + "grad_norm": 1.6515535116195679, + "learning_rate": 1.124468908014616e-05, + "loss": 0.043, + "step": 15250 + }, + { + "epoch": 0.43631165117941384, + "grad_norm": 0.4902660846710205, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0371, + "step": 15260 + }, + { + "epoch": 0.4365975696926376, + "grad_norm": 0.5742762088775635, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0369, + "step": 15270 + }, + { + "epoch": 0.4368834882058613, + "grad_norm": 0.42058590054512024, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0378, + "step": 15280 + }, + { + "epoch": 0.43716940671908505, + "grad_norm": 0.43729284405708313, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0352, + "step": 15290 + }, + { + "epoch": 0.4374553252323088, + "grad_norm": 0.4689466953277588, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0433, + "step": 15300 + }, + { + "epoch": 0.4377412437455325, + "grad_norm": 0.6272432208061218, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0548, + "step": 15310 + }, + { + "epoch": 0.43802716225875626, + "grad_norm": 1.1129611730575562, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0437, + "step": 15320 + }, + { + "epoch": 0.43831308077198, + "grad_norm": 0.9332655072212219, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0503, + "step": 15330 + }, + { + "epoch": 0.43859899928520374, + "grad_norm": 0.35150477290153503, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0351, + "step": 15340 + }, + { + "epoch": 0.4388849177984274, + "grad_norm": 0.3826565444469452, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0361, + "step": 15350 + }, + { + "epoch": 0.43917083631165116, + "grad_norm": 0.817319393157959, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0352, + "step": 15360 + }, + { + "epoch": 0.4394567548248749, + "grad_norm": 0.4379598796367645, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0469, + "step": 15370 + }, + { + "epoch": 0.43974267333809863, + "grad_norm": 0.6475314497947693, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0456, + "step": 15380 + }, + { + "epoch": 0.44002859185132237, + "grad_norm": 0.529088020324707, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0453, + "step": 15390 + }, + { + "epoch": 0.4403145103645461, + "grad_norm": 0.4915194809436798, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0369, + "step": 15400 + }, + { + "epoch": 0.44060042887776985, + "grad_norm": 0.4766380786895752, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0391, + "step": 15410 + }, + { + "epoch": 0.4408863473909936, + "grad_norm": 0.34667786955833435, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0327, + "step": 15420 + }, + { + "epoch": 0.4411722659042173, + "grad_norm": 0.504242479801178, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0413, + "step": 15430 + }, + { + "epoch": 0.441458184417441, + "grad_norm": 0.49786439538002014, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0361, + "step": 15440 + }, + { + "epoch": 0.44174410293066474, + "grad_norm": 0.4997329115867615, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0368, + "step": 15450 + }, + { + "epoch": 0.4420300214438885, + "grad_norm": 0.2992185056209564, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0359, + "step": 15460 + }, + { + "epoch": 0.4423159399571122, + "grad_norm": 0.6645393371582031, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0401, + "step": 15470 + }, + { + "epoch": 0.44260185847033595, + "grad_norm": 0.6327983140945435, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0386, + "step": 15480 + }, + { + "epoch": 0.4428877769835597, + "grad_norm": 0.45607903599739075, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0386, + "step": 15490 + }, + { + "epoch": 0.4431736954967834, + "grad_norm": 0.4401610493659973, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0417, + "step": 15500 + }, + { + "epoch": 0.44345961401000716, + "grad_norm": 0.5778466463088989, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0417, + "step": 15510 + }, + { + "epoch": 0.4437455325232309, + "grad_norm": 0.2164914309978485, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0355, + "step": 15520 + }, + { + "epoch": 0.4440314510364546, + "grad_norm": 0.3869318664073944, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0361, + "step": 15530 + }, + { + "epoch": 0.4443173695496783, + "grad_norm": 0.3843154311180115, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0459, + "step": 15540 + }, + { + "epoch": 0.44460328806290206, + "grad_norm": 0.8488825559616089, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0406, + "step": 15550 + }, + { + "epoch": 0.4448892065761258, + "grad_norm": 0.5055183172225952, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0359, + "step": 15560 + }, + { + "epoch": 0.44517512508934953, + "grad_norm": 0.40923011302948, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0435, + "step": 15570 + }, + { + "epoch": 0.44546104360257327, + "grad_norm": 0.48997730016708374, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0395, + "step": 15580 + }, + { + "epoch": 0.445746962115797, + "grad_norm": 0.5149131417274475, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.041, + "step": 15590 + }, + { + "epoch": 0.44603288062902074, + "grad_norm": 0.7277303338050842, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0452, + "step": 15600 + }, + { + "epoch": 0.4463187991422445, + "grad_norm": 0.48676377534866333, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0363, + "step": 15610 + }, + { + "epoch": 0.44660471765546816, + "grad_norm": 0.49031221866607666, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0356, + "step": 15620 + }, + { + "epoch": 0.4468906361686919, + "grad_norm": 0.38877514004707336, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.036, + "step": 15630 + }, + { + "epoch": 0.44717655468191564, + "grad_norm": 0.570068895816803, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0403, + "step": 15640 + }, + { + "epoch": 0.4474624731951394, + "grad_norm": 0.48499882221221924, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0395, + "step": 15650 + }, + { + "epoch": 0.4477483917083631, + "grad_norm": 0.7251732349395752, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0399, + "step": 15660 + }, + { + "epoch": 0.44803431022158685, + "grad_norm": 0.3927334249019623, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0359, + "step": 15670 + }, + { + "epoch": 0.4483202287348106, + "grad_norm": 0.5614549517631531, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.035, + "step": 15680 + }, + { + "epoch": 0.4486061472480343, + "grad_norm": 0.383831262588501, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0416, + "step": 15690 + }, + { + "epoch": 0.44889206576125806, + "grad_norm": 1.9365276098251343, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0498, + "step": 15700 + }, + { + "epoch": 0.44917798427448175, + "grad_norm": 0.6964924931526184, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.034, + "step": 15710 + }, + { + "epoch": 0.4494639027877055, + "grad_norm": 0.5148108601570129, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0401, + "step": 15720 + }, + { + "epoch": 0.4497498213009292, + "grad_norm": 0.4529317617416382, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0361, + "step": 15730 + }, + { + "epoch": 0.45003573981415296, + "grad_norm": 0.6648512482643127, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0365, + "step": 15740 + }, + { + "epoch": 0.4503216583273767, + "grad_norm": 0.8183113932609558, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0416, + "step": 15750 + }, + { + "epoch": 0.45060757684060043, + "grad_norm": 0.8802638649940491, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0406, + "step": 15760 + }, + { + "epoch": 0.45089349535382417, + "grad_norm": 0.6329004764556885, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0395, + "step": 15770 + }, + { + "epoch": 0.4511794138670479, + "grad_norm": 0.35283520817756653, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0364, + "step": 15780 + }, + { + "epoch": 0.45146533238027164, + "grad_norm": 0.5156061053276062, + "learning_rate": 1.071827766589186e-05, + "loss": 0.031, + "step": 15790 + }, + { + "epoch": 0.4517512508934953, + "grad_norm": 0.37875205278396606, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0375, + "step": 15800 + }, + { + "epoch": 0.45203716940671906, + "grad_norm": 0.5543273687362671, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0421, + "step": 15810 + }, + { + "epoch": 0.4523230879199428, + "grad_norm": 0.3808431923389435, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0323, + "step": 15820 + }, + { + "epoch": 0.45260900643316654, + "grad_norm": 0.8648643493652344, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0396, + "step": 15830 + }, + { + "epoch": 0.4528949249463903, + "grad_norm": 0.7893536686897278, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0417, + "step": 15840 + }, + { + "epoch": 0.453180843459614, + "grad_norm": 0.904137134552002, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0384, + "step": 15850 + }, + { + "epoch": 0.45346676197283775, + "grad_norm": 0.6095889806747437, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0457, + "step": 15860 + }, + { + "epoch": 0.4537526804860615, + "grad_norm": 0.5691415667533875, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0438, + "step": 15870 + }, + { + "epoch": 0.4540385989992852, + "grad_norm": 0.37868618965148926, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0414, + "step": 15880 + }, + { + "epoch": 0.4543245175125089, + "grad_norm": 0.7962950468063354, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0405, + "step": 15890 + }, + { + "epoch": 0.45461043602573264, + "grad_norm": 0.8862378597259521, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0475, + "step": 15900 + }, + { + "epoch": 0.4548963545389564, + "grad_norm": 0.8762509822845459, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0472, + "step": 15910 + }, + { + "epoch": 0.4551822730521801, + "grad_norm": 0.6006313562393188, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0417, + "step": 15920 + }, + { + "epoch": 0.45546819156540386, + "grad_norm": 0.3340131938457489, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0374, + "step": 15930 + }, + { + "epoch": 0.4557541100786276, + "grad_norm": 0.2639552056789398, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0387, + "step": 15940 + }, + { + "epoch": 0.45604002859185133, + "grad_norm": 0.42564907670021057, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0376, + "step": 15950 + }, + { + "epoch": 0.45632594710507507, + "grad_norm": 0.503834068775177, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0344, + "step": 15960 + }, + { + "epoch": 0.4566118656182988, + "grad_norm": 0.5962334871292114, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0379, + "step": 15970 + }, + { + "epoch": 0.4568977841315225, + "grad_norm": 0.3271556794643402, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0361, + "step": 15980 + }, + { + "epoch": 0.4571837026447462, + "grad_norm": 0.5501612424850464, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0356, + "step": 15990 + }, + { + "epoch": 0.45746962115796996, + "grad_norm": 1.0399914979934692, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.039, + "step": 16000 + }, + { + "epoch": 0.4577555396711937, + "grad_norm": 0.42251288890838623, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0413, + "step": 16010 + }, + { + "epoch": 0.45804145818441744, + "grad_norm": 0.5694882869720459, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0501, + "step": 16020 + }, + { + "epoch": 0.4583273766976412, + "grad_norm": 0.37367814779281616, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0388, + "step": 16030 + }, + { + "epoch": 0.4586132952108649, + "grad_norm": 0.7947224974632263, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0324, + "step": 16040 + }, + { + "epoch": 0.45889921372408865, + "grad_norm": 0.47871798276901245, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0345, + "step": 16050 + }, + { + "epoch": 0.4591851322373124, + "grad_norm": 1.4443609714508057, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0502, + "step": 16060 + }, + { + "epoch": 0.45947105075053607, + "grad_norm": 0.8326191902160645, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0325, + "step": 16070 + }, + { + "epoch": 0.4597569692637598, + "grad_norm": 0.2887400686740875, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.035, + "step": 16080 + }, + { + "epoch": 0.46004288777698354, + "grad_norm": 0.34353405237197876, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0324, + "step": 16090 + }, + { + "epoch": 0.4603288062902073, + "grad_norm": 0.7319850325584412, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0307, + "step": 16100 + }, + { + "epoch": 0.460614724803431, + "grad_norm": 0.6628556847572327, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0398, + "step": 16110 + }, + { + "epoch": 0.46090064331665476, + "grad_norm": 0.39974722266197205, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.038, + "step": 16120 + }, + { + "epoch": 0.4611865618298785, + "grad_norm": 0.7769339680671692, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0425, + "step": 16130 + }, + { + "epoch": 0.46147248034310223, + "grad_norm": 0.6823691129684448, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.039, + "step": 16140 + }, + { + "epoch": 0.46175839885632597, + "grad_norm": 0.6749460697174072, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0388, + "step": 16150 + }, + { + "epoch": 0.46204431736954965, + "grad_norm": 1.0745635032653809, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0406, + "step": 16160 + }, + { + "epoch": 0.4623302358827734, + "grad_norm": 0.8388734459877014, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0345, + "step": 16170 + }, + { + "epoch": 0.4626161543959971, + "grad_norm": 0.675828218460083, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0355, + "step": 16180 + }, + { + "epoch": 0.46290207290922086, + "grad_norm": 0.9872504472732544, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0374, + "step": 16190 + }, + { + "epoch": 0.4631879914224446, + "grad_norm": 0.4705125689506531, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0416, + "step": 16200 + }, + { + "epoch": 0.46347390993566834, + "grad_norm": 0.43577539920806885, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.041, + "step": 16210 + }, + { + "epoch": 0.4637598284488921, + "grad_norm": 0.6472166180610657, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0372, + "step": 16220 + }, + { + "epoch": 0.4640457469621158, + "grad_norm": 1.0108906030654907, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0464, + "step": 16230 + }, + { + "epoch": 0.46433166547533955, + "grad_norm": 0.6221884489059448, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0396, + "step": 16240 + }, + { + "epoch": 0.46461758398856323, + "grad_norm": 0.7375202178955078, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0365, + "step": 16250 + }, + { + "epoch": 0.46490350250178697, + "grad_norm": 0.5090222358703613, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0404, + "step": 16260 + }, + { + "epoch": 0.4651894210150107, + "grad_norm": 0.5641722679138184, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0424, + "step": 16270 + }, + { + "epoch": 0.46547533952823444, + "grad_norm": 0.3946240246295929, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0433, + "step": 16280 + }, + { + "epoch": 0.4657612580414582, + "grad_norm": 0.525059700012207, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0399, + "step": 16290 + }, + { + "epoch": 0.4660471765546819, + "grad_norm": 0.6106441617012024, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0417, + "step": 16300 + }, + { + "epoch": 0.46633309506790566, + "grad_norm": 0.7064299583435059, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0331, + "step": 16310 + }, + { + "epoch": 0.4666190135811294, + "grad_norm": 0.6251654624938965, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0377, + "step": 16320 + }, + { + "epoch": 0.46690493209435313, + "grad_norm": 0.6626482009887695, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0355, + "step": 16330 + }, + { + "epoch": 0.4671908506075768, + "grad_norm": 0.32827794551849365, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0438, + "step": 16340 + }, + { + "epoch": 0.46747676912080055, + "grad_norm": 1.147644281387329, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.041, + "step": 16350 + }, + { + "epoch": 0.4677626876340243, + "grad_norm": 0.5785626769065857, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0362, + "step": 16360 + }, + { + "epoch": 0.468048606147248, + "grad_norm": 0.7087936401367188, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0364, + "step": 16370 + }, + { + "epoch": 0.46833452466047176, + "grad_norm": 0.7729533314704895, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0357, + "step": 16380 + }, + { + "epoch": 0.4686204431736955, + "grad_norm": 0.9080077409744263, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0445, + "step": 16390 + }, + { + "epoch": 0.46890636168691924, + "grad_norm": 0.5273067355155945, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0395, + "step": 16400 + }, + { + "epoch": 0.469192280200143, + "grad_norm": 0.4801991581916809, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0469, + "step": 16410 + }, + { + "epoch": 0.4694781987133667, + "grad_norm": 0.38060688972473145, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0377, + "step": 16420 + }, + { + "epoch": 0.4697641172265904, + "grad_norm": 1.335648536682129, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0444, + "step": 16430 + }, + { + "epoch": 0.47005003573981413, + "grad_norm": 0.6224690079689026, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0365, + "step": 16440 + }, + { + "epoch": 0.47033595425303787, + "grad_norm": 0.39938899874687195, + "learning_rate": 1.007637577910799e-05, + "loss": 0.037, + "step": 16450 + }, + { + "epoch": 0.4706218727662616, + "grad_norm": 0.47899872064590454, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0371, + "step": 16460 + }, + { + "epoch": 0.47090779127948534, + "grad_norm": 0.8991144895553589, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0337, + "step": 16470 + }, + { + "epoch": 0.4711937097927091, + "grad_norm": 0.6228598356246948, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0388, + "step": 16480 + }, + { + "epoch": 0.4714796283059328, + "grad_norm": 0.41108259558677673, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0378, + "step": 16490 + }, + { + "epoch": 0.47176554681915656, + "grad_norm": 0.722955048084259, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0381, + "step": 16500 + }, + { + "epoch": 0.4720514653323803, + "grad_norm": 0.6090973019599915, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0348, + "step": 16510 + }, + { + "epoch": 0.472337383845604, + "grad_norm": 0.483549565076828, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0456, + "step": 16520 + }, + { + "epoch": 0.4726233023588277, + "grad_norm": 0.4134727418422699, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0444, + "step": 16530 + }, + { + "epoch": 0.47290922087205145, + "grad_norm": 0.4629753530025482, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0382, + "step": 16540 + }, + { + "epoch": 0.4731951393852752, + "grad_norm": 0.8709504008293152, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0384, + "step": 16550 + }, + { + "epoch": 0.4734810578984989, + "grad_norm": 0.683397114276886, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0398, + "step": 16560 + }, + { + "epoch": 0.47376697641172266, + "grad_norm": 0.5743465423583984, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0431, + "step": 16570 + }, + { + "epoch": 0.4740528949249464, + "grad_norm": 1.0080480575561523, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0378, + "step": 16580 + }, + { + "epoch": 0.47433881343817014, + "grad_norm": 0.4668700098991394, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0369, + "step": 16590 + }, + { + "epoch": 0.4746247319513939, + "grad_norm": 0.6005896925926208, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0508, + "step": 16600 + }, + { + "epoch": 0.47491065046461756, + "grad_norm": 0.5788530707359314, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0354, + "step": 16610 + }, + { + "epoch": 0.4751965689778413, + "grad_norm": 0.38784441351890564, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0357, + "step": 16620 + }, + { + "epoch": 0.47548248749106503, + "grad_norm": 0.4809567928314209, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0331, + "step": 16630 + }, + { + "epoch": 0.47576840600428877, + "grad_norm": 0.6647809147834778, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0473, + "step": 16640 + }, + { + "epoch": 0.4760543245175125, + "grad_norm": 0.3968522548675537, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0304, + "step": 16650 + }, + { + "epoch": 0.47634024303073624, + "grad_norm": 0.3258526027202606, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0387, + "step": 16660 + }, + { + "epoch": 0.47662616154396, + "grad_norm": 0.43442079424858093, + "learning_rate": 9.863295834019308e-06, + "loss": 0.04, + "step": 16670 + }, + { + "epoch": 0.4769120800571837, + "grad_norm": 0.36909565329551697, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0351, + "step": 16680 + }, + { + "epoch": 0.47719799857040746, + "grad_norm": 0.5566768050193787, + "learning_rate": 9.843955128197274e-06, + "loss": 0.031, + "step": 16690 + }, + { + "epoch": 0.47748391708363114, + "grad_norm": 0.5705142617225647, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0359, + "step": 16700 + }, + { + "epoch": 0.4777698355968549, + "grad_norm": 0.28931716084480286, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0407, + "step": 16710 + }, + { + "epoch": 0.4780557541100786, + "grad_norm": 0.5509498715400696, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0363, + "step": 16720 + }, + { + "epoch": 0.47834167262330235, + "grad_norm": 0.3564346432685852, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0364, + "step": 16730 + }, + { + "epoch": 0.4786275911365261, + "grad_norm": 0.32734423875808716, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0369, + "step": 16740 + }, + { + "epoch": 0.4789135096497498, + "grad_norm": 0.3048594892024994, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0367, + "step": 16750 + }, + { + "epoch": 0.47919942816297356, + "grad_norm": 0.9007049798965454, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0377, + "step": 16760 + }, + { + "epoch": 0.4794853466761973, + "grad_norm": 0.7010983824729919, + "learning_rate": 9.76664747972605e-06, + "loss": 0.039, + "step": 16770 + }, + { + "epoch": 0.47977126518942104, + "grad_norm": 0.644473135471344, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0466, + "step": 16780 + }, + { + "epoch": 0.4800571837026447, + "grad_norm": 0.6333492398262024, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0373, + "step": 16790 + }, + { + "epoch": 0.48034310221586846, + "grad_norm": 0.5148355960845947, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0392, + "step": 16800 + }, + { + "epoch": 0.4806290207290922, + "grad_norm": 0.7288355231285095, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0381, + "step": 16810 + }, + { + "epoch": 0.48091493924231593, + "grad_norm": 0.3674873113632202, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0418, + "step": 16820 + }, + { + "epoch": 0.48120085775553967, + "grad_norm": 0.5055420398712158, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0336, + "step": 16830 + }, + { + "epoch": 0.4814867762687634, + "grad_norm": 0.641754686832428, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0342, + "step": 16840 + }, + { + "epoch": 0.48177269478198714, + "grad_norm": 0.308200478553772, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0364, + "step": 16850 + }, + { + "epoch": 0.4820586132952109, + "grad_norm": 0.41361021995544434, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0342, + "step": 16860 + }, + { + "epoch": 0.4823445318084346, + "grad_norm": 0.45777833461761475, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0353, + "step": 16870 + }, + { + "epoch": 0.4826304503216583, + "grad_norm": 0.7587664723396301, + "learning_rate": 9.660501900166734e-06, + "loss": 0.043, + "step": 16880 + }, + { + "epoch": 0.48291636883488204, + "grad_norm": 0.8740283250808716, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0372, + "step": 16890 + }, + { + "epoch": 0.4832022873481058, + "grad_norm": 0.3009270429611206, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0373, + "step": 16900 + }, + { + "epoch": 0.4834882058613295, + "grad_norm": 0.4439285695552826, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0349, + "step": 16910 + }, + { + "epoch": 0.48377412437455325, + "grad_norm": 0.39849671721458435, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0394, + "step": 16920 + }, + { + "epoch": 0.484060042887777, + "grad_norm": 0.6423043608665466, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0413, + "step": 16930 + }, + { + "epoch": 0.4843459614010007, + "grad_norm": 0.3683928847312927, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0387, + "step": 16940 + }, + { + "epoch": 0.48463187991422446, + "grad_norm": 0.7087769508361816, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0397, + "step": 16950 + }, + { + "epoch": 0.4849177984274482, + "grad_norm": 0.5348120927810669, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0405, + "step": 16960 + }, + { + "epoch": 0.4852037169406719, + "grad_norm": 0.549891471862793, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0363, + "step": 16970 + }, + { + "epoch": 0.4854896354538956, + "grad_norm": 0.7177272439002991, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0343, + "step": 16980 + }, + { + "epoch": 0.48577555396711936, + "grad_norm": 0.595417320728302, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0439, + "step": 16990 + }, + { + "epoch": 0.4860614724803431, + "grad_norm": 0.4838889241218567, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0387, + "step": 17000 + }, + { + "epoch": 0.48634739099356683, + "grad_norm": 0.6186223030090332, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0362, + "step": 17010 + }, + { + "epoch": 0.48663330950679057, + "grad_norm": 0.43383121490478516, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0381, + "step": 17020 + }, + { + "epoch": 0.4869192280200143, + "grad_norm": 0.6735527515411377, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0388, + "step": 17030 + }, + { + "epoch": 0.48720514653323804, + "grad_norm": 0.3746320605278015, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0491, + "step": 17040 + }, + { + "epoch": 0.4874910650464618, + "grad_norm": 0.29500988125801086, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0395, + "step": 17050 + }, + { + "epoch": 0.48777698355968546, + "grad_norm": 0.8518465757369995, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0435, + "step": 17060 + }, + { + "epoch": 0.4880629020729092, + "grad_norm": 0.9653190970420837, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0393, + "step": 17070 + }, + { + "epoch": 0.48834882058613294, + "grad_norm": 0.785724937915802, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0372, + "step": 17080 + }, + { + "epoch": 0.4886347390993567, + "grad_norm": 0.9450638890266418, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0406, + "step": 17090 + }, + { + "epoch": 0.4889206576125804, + "grad_norm": 0.645124077796936, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0361, + "step": 17100 + }, + { + "epoch": 0.48920657612580415, + "grad_norm": 0.3352372944355011, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0417, + "step": 17110 + }, + { + "epoch": 0.4894924946390279, + "grad_norm": 0.3858814835548401, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0345, + "step": 17120 + }, + { + "epoch": 0.4897784131522516, + "grad_norm": 0.5403604507446289, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0326, + "step": 17130 + }, + { + "epoch": 0.49006433166547536, + "grad_norm": 0.6986777782440186, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0417, + "step": 17140 + }, + { + "epoch": 0.49035025017869904, + "grad_norm": 0.5456675887107849, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0473, + "step": 17150 + }, + { + "epoch": 0.4906361686919228, + "grad_norm": 0.3961554765701294, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0341, + "step": 17160 + }, + { + "epoch": 0.4909220872051465, + "grad_norm": 0.5188277363777161, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0369, + "step": 17170 + }, + { + "epoch": 0.49120800571837026, + "grad_norm": 0.6042230725288391, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0352, + "step": 17180 + }, + { + "epoch": 0.491493924231594, + "grad_norm": 0.5485941171646118, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0405, + "step": 17190 + }, + { + "epoch": 0.49177984274481773, + "grad_norm": 0.5856509804725647, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0402, + "step": 17200 + }, + { + "epoch": 0.49206576125804147, + "grad_norm": 0.8656556010246277, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0349, + "step": 17210 + }, + { + "epoch": 0.4923516797712652, + "grad_norm": 0.4041757583618164, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0364, + "step": 17220 + }, + { + "epoch": 0.49263759828448894, + "grad_norm": 0.6135975122451782, + "learning_rate": 9.324104146177972e-06, + "loss": 0.036, + "step": 17230 + }, + { + "epoch": 0.4929235167977126, + "grad_norm": 0.5101860165596008, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0359, + "step": 17240 + }, + { + "epoch": 0.49320943531093636, + "grad_norm": 0.9913426041603088, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0552, + "step": 17250 + }, + { + "epoch": 0.4934953538241601, + "grad_norm": 0.6148158311843872, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0388, + "step": 17260 + }, + { + "epoch": 0.49378127233738384, + "grad_norm": 0.6651721596717834, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0374, + "step": 17270 + }, + { + "epoch": 0.4940671908506076, + "grad_norm": 0.9545061588287354, + "learning_rate": 9.276232738281744e-06, + "loss": 0.035, + "step": 17280 + }, + { + "epoch": 0.4943531093638313, + "grad_norm": 0.8923225402832031, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0366, + "step": 17290 + }, + { + "epoch": 0.49463902787705505, + "grad_norm": 0.5337848663330078, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0354, + "step": 17300 + }, + { + "epoch": 0.4949249463902788, + "grad_norm": 0.35039281845092773, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0341, + "step": 17310 + }, + { + "epoch": 0.4952108649035025, + "grad_norm": 0.47406911849975586, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0393, + "step": 17320 + }, + { + "epoch": 0.4954967834167262, + "grad_norm": 0.6226631999015808, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0375, + "step": 17330 + }, + { + "epoch": 0.49578270192994994, + "grad_norm": 0.6652712821960449, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0363, + "step": 17340 + }, + { + "epoch": 0.4960686204431737, + "grad_norm": 1.0042835474014282, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0368, + "step": 17350 + }, + { + "epoch": 0.4963545389563974, + "grad_norm": 0.4334045648574829, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0375, + "step": 17360 + }, + { + "epoch": 0.49664045746962115, + "grad_norm": 0.3561633229255676, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0347, + "step": 17370 + }, + { + "epoch": 0.4969263759828449, + "grad_norm": 0.5763550996780396, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0344, + "step": 17380 + }, + { + "epoch": 0.49721229449606863, + "grad_norm": 0.6306643486022949, + "learning_rate": 9.171095634265995e-06, + "loss": 0.037, + "step": 17390 + }, + { + "epoch": 0.49749821300929237, + "grad_norm": 0.4286569058895111, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0308, + "step": 17400 + }, + { + "epoch": 0.4977841315225161, + "grad_norm": 0.577983558177948, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0393, + "step": 17410 + }, + { + "epoch": 0.4980700500357398, + "grad_norm": 0.5714932084083557, + "learning_rate": 9.142466323573853e-06, + "loss": 0.038, + "step": 17420 + }, + { + "epoch": 0.4983559685489635, + "grad_norm": 0.7529498338699341, + "learning_rate": 9.132927564918328e-06, + "loss": 0.033, + "step": 17430 + }, + { + "epoch": 0.49864188706218726, + "grad_norm": 0.5179672241210938, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0367, + "step": 17440 + }, + { + "epoch": 0.498927805575411, + "grad_norm": 0.38424569368362427, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0401, + "step": 17450 + }, + { + "epoch": 0.49921372408863474, + "grad_norm": 0.469460129737854, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0379, + "step": 17460 + }, + { + "epoch": 0.4994996426018585, + "grad_norm": 0.3285387456417084, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0399, + "step": 17470 + }, + { + "epoch": 0.4997855611150822, + "grad_norm": 0.49863550066947937, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0313, + "step": 17480 + }, + { + "epoch": 0.5000714796283059, + "grad_norm": 0.3926186263561249, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0454, + "step": 17490 + }, + { + "epoch": 0.5003573981415297, + "grad_norm": 0.4476146399974823, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0472, + "step": 17500 + }, + { + "epoch": 0.5006433166547534, + "grad_norm": 0.5645599961280823, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0358, + "step": 17510 + }, + { + "epoch": 0.5009292351679772, + "grad_norm": 0.4813307225704193, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0364, + "step": 17520 + }, + { + "epoch": 0.5012151536812008, + "grad_norm": 0.49410971999168396, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0303, + "step": 17530 + }, + { + "epoch": 0.5015010721944246, + "grad_norm": 0.7172105312347412, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0404, + "step": 17540 + }, + { + "epoch": 0.5017869907076483, + "grad_norm": 0.43401873111724854, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0402, + "step": 17550 + }, + { + "epoch": 0.502072909220872, + "grad_norm": 0.6497406363487244, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0364, + "step": 17560 + }, + { + "epoch": 0.5023588277340958, + "grad_norm": 0.44618356227874756, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0337, + "step": 17570 + }, + { + "epoch": 0.5026447462473195, + "grad_norm": 0.4186992049217224, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0381, + "step": 17580 + }, + { + "epoch": 0.5029306647605433, + "grad_norm": 0.7387974858283997, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0319, + "step": 17590 + }, + { + "epoch": 0.503216583273767, + "grad_norm": 0.8068642020225525, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0373, + "step": 17600 + }, + { + "epoch": 0.5035025017869907, + "grad_norm": 0.5773473978042603, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0372, + "step": 17610 + }, + { + "epoch": 0.5037884203002144, + "grad_norm": 0.32488778233528137, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0334, + "step": 17620 + }, + { + "epoch": 0.5040743388134382, + "grad_norm": 0.33978500962257385, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0493, + "step": 17630 + }, + { + "epoch": 0.5043602573266619, + "grad_norm": 0.5897071361541748, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0335, + "step": 17640 + }, + { + "epoch": 0.5046461758398856, + "grad_norm": 0.6275895833969116, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0395, + "step": 17650 + }, + { + "epoch": 0.5049320943531094, + "grad_norm": 0.7995536923408508, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0422, + "step": 17660 + }, + { + "epoch": 0.505218012866333, + "grad_norm": 0.8734716773033142, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0414, + "step": 17670 + }, + { + "epoch": 0.5055039313795568, + "grad_norm": 0.6239343881607056, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0333, + "step": 17680 + }, + { + "epoch": 0.5057898498927805, + "grad_norm": 0.42508623003959656, + "learning_rate": 8.885721609997551e-06, + "loss": 0.045, + "step": 17690 + }, + { + "epoch": 0.5060757684060043, + "grad_norm": 0.4272485673427582, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0506, + "step": 17700 + }, + { + "epoch": 0.506361686919228, + "grad_norm": 0.8006368279457092, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0431, + "step": 17710 + }, + { + "epoch": 0.5066476054324518, + "grad_norm": 0.5896835327148438, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0322, + "step": 17720 + }, + { + "epoch": 0.5069335239456755, + "grad_norm": 0.6880389451980591, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0322, + "step": 17730 + }, + { + "epoch": 0.5072194424588992, + "grad_norm": 1.4850202798843384, + "learning_rate": 8.83836825410936e-06, + "loss": 0.052, + "step": 17740 + }, + { + "epoch": 0.507505360972123, + "grad_norm": 0.7684240937232971, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0353, + "step": 17750 + }, + { + "epoch": 0.5077912794853466, + "grad_norm": 0.5456307530403137, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0419, + "step": 17760 + }, + { + "epoch": 0.5080771979985704, + "grad_norm": 0.5775120258331299, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0366, + "step": 17770 + }, + { + "epoch": 0.5083631165117941, + "grad_norm": 0.6453070044517517, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0341, + "step": 17780 + }, + { + "epoch": 0.5086490350250179, + "grad_norm": 0.7906973361968994, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0405, + "step": 17790 + }, + { + "epoch": 0.5089349535382416, + "grad_norm": 1.0740606784820557, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0344, + "step": 17800 + }, + { + "epoch": 0.5092208720514654, + "grad_norm": 0.41854357719421387, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0334, + "step": 17810 + }, + { + "epoch": 0.5095067905646891, + "grad_norm": 0.6328964233398438, + "learning_rate": 8.762735374981932e-06, + "loss": 0.036, + "step": 17820 + }, + { + "epoch": 0.5097927090779127, + "grad_norm": 0.40875789523124695, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0338, + "step": 17830 + }, + { + "epoch": 0.5100786275911365, + "grad_norm": 0.5056312084197998, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0332, + "step": 17840 + }, + { + "epoch": 0.5103645461043602, + "grad_norm": 0.5005037784576416, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0416, + "step": 17850 + }, + { + "epoch": 0.510650464617584, + "grad_norm": 0.5689167380332947, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0329, + "step": 17860 + }, + { + "epoch": 0.5109363831308077, + "grad_norm": 0.5222717523574829, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0336, + "step": 17870 + }, + { + "epoch": 0.5112223016440315, + "grad_norm": 0.5998329520225525, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0354, + "step": 17880 + }, + { + "epoch": 0.5115082201572552, + "grad_norm": 0.4684480130672455, + "learning_rate": 8.69669425266315e-06, + "loss": 0.05, + "step": 17890 + }, + { + "epoch": 0.511794138670479, + "grad_norm": 0.4061124622821808, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0384, + "step": 17900 + }, + { + "epoch": 0.5120800571837026, + "grad_norm": 0.5025928020477295, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0386, + "step": 17910 + }, + { + "epoch": 0.5123659756969263, + "grad_norm": 0.3731222152709961, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0378, + "step": 17920 + }, + { + "epoch": 0.5126518942101501, + "grad_norm": 0.7784973978996277, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0419, + "step": 17930 + }, + { + "epoch": 0.5129378127233738, + "grad_norm": 0.7074074745178223, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0386, + "step": 17940 + }, + { + "epoch": 0.5132237312365976, + "grad_norm": 0.49802306294441223, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0418, + "step": 17950 + }, + { + "epoch": 0.5135096497498213, + "grad_norm": 0.4355427920818329, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0431, + "step": 17960 + }, + { + "epoch": 0.5137955682630451, + "grad_norm": 0.672635555267334, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0403, + "step": 17970 + }, + { + "epoch": 0.5140814867762687, + "grad_norm": 0.6733908653259277, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0487, + "step": 17980 + }, + { + "epoch": 0.5143674052894925, + "grad_norm": 0.43711504340171814, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0378, + "step": 17990 + }, + { + "epoch": 0.5146533238027162, + "grad_norm": 0.6371222138404846, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0336, + "step": 18000 + }, + { + "epoch": 0.5149392423159399, + "grad_norm": 0.8007041811943054, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0371, + "step": 18010 + }, + { + "epoch": 0.5152251608291637, + "grad_norm": 0.4725078344345093, + "learning_rate": 8.574400723012433e-06, + "loss": 0.037, + "step": 18020 + }, + { + "epoch": 0.5155110793423874, + "grad_norm": 0.34229791164398193, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0353, + "step": 18030 + }, + { + "epoch": 0.5157969978556112, + "grad_norm": 0.27863454818725586, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0371, + "step": 18040 + }, + { + "epoch": 0.5160829163688349, + "grad_norm": 0.43021920323371887, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0419, + "step": 18050 + }, + { + "epoch": 0.5163688348820586, + "grad_norm": 0.4683758318424225, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0307, + "step": 18060 + }, + { + "epoch": 0.5166547533952823, + "grad_norm": 0.29085367918014526, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0372, + "step": 18070 + }, + { + "epoch": 0.5169406719085061, + "grad_norm": 0.4396727681159973, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0328, + "step": 18080 + }, + { + "epoch": 0.5172265904217298, + "grad_norm": 0.539021372795105, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0317, + "step": 18090 + }, + { + "epoch": 0.5175125089349535, + "grad_norm": 0.556974470615387, + "learning_rate": 8.499380733111628e-06, + "loss": 0.037, + "step": 18100 + }, + { + "epoch": 0.5177984274481773, + "grad_norm": 0.4445747137069702, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0344, + "step": 18110 + }, + { + "epoch": 0.518084345961401, + "grad_norm": 0.3742713928222656, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0339, + "step": 18120 + }, + { + "epoch": 0.5183702644746248, + "grad_norm": 0.8467416167259216, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0409, + "step": 18130 + }, + { + "epoch": 0.5186561829878484, + "grad_norm": 0.7731484770774841, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0379, + "step": 18140 + }, + { + "epoch": 0.5189421015010722, + "grad_norm": 0.5664084553718567, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0353, + "step": 18150 + }, + { + "epoch": 0.5192280200142959, + "grad_norm": 0.5623966455459595, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0412, + "step": 18160 + }, + { + "epoch": 0.5195139385275197, + "grad_norm": 0.5074556469917297, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0402, + "step": 18170 + }, + { + "epoch": 0.5197998570407434, + "grad_norm": 0.49439728260040283, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0341, + "step": 18180 + }, + { + "epoch": 0.5200857755539671, + "grad_norm": 0.5982527136802673, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0308, + "step": 18190 + }, + { + "epoch": 0.5203716940671909, + "grad_norm": 0.7891598343849182, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0437, + "step": 18200 + }, + { + "epoch": 0.5206576125804145, + "grad_norm": 0.7565666437149048, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0381, + "step": 18210 + }, + { + "epoch": 0.5209435310936383, + "grad_norm": 0.33346351981163025, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0454, + "step": 18220 + }, + { + "epoch": 0.521229449606862, + "grad_norm": 0.5885659456253052, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0413, + "step": 18230 + }, + { + "epoch": 0.5215153681200858, + "grad_norm": 0.6487091183662415, + "learning_rate": 8.368551060444755e-06, + "loss": 0.035, + "step": 18240 + }, + { + "epoch": 0.5218012866333095, + "grad_norm": 0.9817430377006531, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0394, + "step": 18250 + }, + { + "epoch": 0.5220872051465333, + "grad_norm": 0.5691193342208862, + "learning_rate": 8.349909816537207e-06, + "loss": 0.041, + "step": 18260 + }, + { + "epoch": 0.522373123659757, + "grad_norm": 0.5326661467552185, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0361, + "step": 18270 + }, + { + "epoch": 0.5226590421729806, + "grad_norm": 0.5536142587661743, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0406, + "step": 18280 + }, + { + "epoch": 0.5229449606862044, + "grad_norm": 0.3482394218444824, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0423, + "step": 18290 + }, + { + "epoch": 0.5232308791994281, + "grad_norm": 0.514914333820343, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0352, + "step": 18300 + }, + { + "epoch": 0.5235167977126519, + "grad_norm": 0.7681404948234558, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0386, + "step": 18310 + }, + { + "epoch": 0.5238027162258756, + "grad_norm": 0.400426983833313, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0333, + "step": 18320 + }, + { + "epoch": 0.5240886347390994, + "grad_norm": 0.4996081590652466, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0381, + "step": 18330 + }, + { + "epoch": 0.5243745532523231, + "grad_norm": 0.5379085540771484, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0348, + "step": 18340 + }, + { + "epoch": 0.5246604717655469, + "grad_norm": 0.4462053179740906, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0307, + "step": 18350 + }, + { + "epoch": 0.5249463902787705, + "grad_norm": 0.7336096167564392, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0345, + "step": 18360 + }, + { + "epoch": 0.5252323087919942, + "grad_norm": 0.6676360368728638, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0346, + "step": 18370 + }, + { + "epoch": 0.525518227305218, + "grad_norm": 0.46608656644821167, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0334, + "step": 18380 + }, + { + "epoch": 0.5258041458184417, + "grad_norm": 0.4906940460205078, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0331, + "step": 18390 + }, + { + "epoch": 0.5260900643316655, + "grad_norm": 0.4200032353401184, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0394, + "step": 18400 + }, + { + "epoch": 0.5263759828448892, + "grad_norm": 0.5663877725601196, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0349, + "step": 18410 + }, + { + "epoch": 0.526661901358113, + "grad_norm": 0.36824384331703186, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0303, + "step": 18420 + }, + { + "epoch": 0.5269478198713367, + "grad_norm": 0.8120076060295105, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0443, + "step": 18430 + }, + { + "epoch": 0.5272337383845604, + "grad_norm": 0.4102472960948944, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0369, + "step": 18440 + }, + { + "epoch": 0.5275196568977841, + "grad_norm": 0.5186526775360107, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0338, + "step": 18450 + }, + { + "epoch": 0.5278055754110078, + "grad_norm": 0.9650108218193054, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0343, + "step": 18460 + }, + { + "epoch": 0.5280914939242316, + "grad_norm": 0.5894375443458557, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0416, + "step": 18470 + }, + { + "epoch": 0.5283774124374553, + "grad_norm": 0.6188816428184509, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0402, + "step": 18480 + }, + { + "epoch": 0.5286633309506791, + "grad_norm": 0.35280847549438477, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0363, + "step": 18490 + }, + { + "epoch": 0.5289492494639028, + "grad_norm": 0.7289313673973083, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0392, + "step": 18500 + }, + { + "epoch": 0.5292351679771266, + "grad_norm": 0.505050778388977, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0329, + "step": 18510 + }, + { + "epoch": 0.5295210864903502, + "grad_norm": 0.7029705047607422, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0344, + "step": 18520 + }, + { + "epoch": 0.529807005003574, + "grad_norm": 0.2958471477031708, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0431, + "step": 18530 + }, + { + "epoch": 0.5300929235167977, + "grad_norm": 0.9649683237075806, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0329, + "step": 18540 + }, + { + "epoch": 0.5303788420300214, + "grad_norm": 0.24733735620975494, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0354, + "step": 18550 + }, + { + "epoch": 0.5306647605432452, + "grad_norm": 0.44838136434555054, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0334, + "step": 18560 + }, + { + "epoch": 0.5309506790564689, + "grad_norm": 0.4505597949028015, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0338, + "step": 18570 + }, + { + "epoch": 0.5312365975696927, + "grad_norm": 0.44188442826271057, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0326, + "step": 18580 + }, + { + "epoch": 0.5315225160829163, + "grad_norm": 0.4539152979850769, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0369, + "step": 18590 + }, + { + "epoch": 0.5318084345961401, + "grad_norm": 0.8311023712158203, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0441, + "step": 18600 + }, + { + "epoch": 0.5320943531093638, + "grad_norm": 0.53764808177948, + "learning_rate": 8.025779439806006e-06, + "loss": 0.037, + "step": 18610 + }, + { + "epoch": 0.5323802716225876, + "grad_norm": 1.2192102670669556, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0369, + "step": 18620 + }, + { + "epoch": 0.5326661901358113, + "grad_norm": 0.5254611968994141, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0371, + "step": 18630 + }, + { + "epoch": 0.532952108649035, + "grad_norm": 0.585709810256958, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0337, + "step": 18640 + }, + { + "epoch": 0.5332380271622588, + "grad_norm": 0.45416259765625, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0425, + "step": 18650 + }, + { + "epoch": 0.5335239456754824, + "grad_norm": 0.3957739472389221, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0354, + "step": 18660 + }, + { + "epoch": 0.5338098641887062, + "grad_norm": 0.6211117506027222, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0347, + "step": 18670 + }, + { + "epoch": 0.5340957827019299, + "grad_norm": 0.49023327231407166, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0308, + "step": 18680 + }, + { + "epoch": 0.5343817012151537, + "grad_norm": 0.5823351144790649, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0351, + "step": 18690 + }, + { + "epoch": 0.5346676197283774, + "grad_norm": 0.6048677563667297, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0382, + "step": 18700 + }, + { + "epoch": 0.5349535382416012, + "grad_norm": 0.5293828845024109, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0329, + "step": 18710 + }, + { + "epoch": 0.5352394567548249, + "grad_norm": 0.5935509204864502, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0388, + "step": 18720 + }, + { + "epoch": 0.5355253752680486, + "grad_norm": 0.8369598388671875, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0421, + "step": 18730 + }, + { + "epoch": 0.5358112937812723, + "grad_norm": 0.6874870657920837, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0385, + "step": 18740 + }, + { + "epoch": 0.536097212294496, + "grad_norm": 0.43511492013931274, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0405, + "step": 18750 + }, + { + "epoch": 0.5363831308077198, + "grad_norm": 0.662755012512207, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0375, + "step": 18760 + }, + { + "epoch": 0.5366690493209435, + "grad_norm": 0.5519852638244629, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0351, + "step": 18770 + }, + { + "epoch": 0.5369549678341673, + "grad_norm": 0.9711637496948242, + "learning_rate": 7.869858673101027e-06, + "loss": 0.038, + "step": 18780 + }, + { + "epoch": 0.537240886347391, + "grad_norm": 0.4944411516189575, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0416, + "step": 18790 + }, + { + "epoch": 0.5375268048606148, + "grad_norm": 0.5257377624511719, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0349, + "step": 18800 + }, + { + "epoch": 0.5378127233738385, + "grad_norm": 0.4833063781261444, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0414, + "step": 18810 + }, + { + "epoch": 0.5380986418870621, + "grad_norm": 0.4496164917945862, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0369, + "step": 18820 + }, + { + "epoch": 0.5383845604002859, + "grad_norm": 0.6939138174057007, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0337, + "step": 18830 + }, + { + "epoch": 0.5386704789135096, + "grad_norm": 0.32579538226127625, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0371, + "step": 18840 + }, + { + "epoch": 0.5389563974267334, + "grad_norm": 0.35594654083251953, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0366, + "step": 18850 + }, + { + "epoch": 0.5392423159399571, + "grad_norm": 0.6114012002944946, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0372, + "step": 18860 + }, + { + "epoch": 0.5395282344531809, + "grad_norm": 0.8492457270622253, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0346, + "step": 18870 + }, + { + "epoch": 0.5398141529664046, + "grad_norm": 0.5214036703109741, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0388, + "step": 18880 + }, + { + "epoch": 0.5401000714796284, + "grad_norm": 0.428671658039093, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0394, + "step": 18890 + }, + { + "epoch": 0.540385989992852, + "grad_norm": 0.6071562767028809, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0371, + "step": 18900 + }, + { + "epoch": 0.5406719085060757, + "grad_norm": 0.41996505856513977, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0334, + "step": 18910 + }, + { + "epoch": 0.5409578270192995, + "grad_norm": 0.5260844826698303, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0362, + "step": 18920 + }, + { + "epoch": 0.5412437455325232, + "grad_norm": 0.43362122774124146, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0325, + "step": 18930 + }, + { + "epoch": 0.541529664045747, + "grad_norm": 0.4597149193286896, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0345, + "step": 18940 + }, + { + "epoch": 0.5418155825589707, + "grad_norm": 0.6667322516441345, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0421, + "step": 18950 + }, + { + "epoch": 0.5421015010721945, + "grad_norm": 0.8998900651931763, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0368, + "step": 18960 + }, + { + "epoch": 0.5423874195854181, + "grad_norm": 0.5075538158416748, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0327, + "step": 18970 + }, + { + "epoch": 0.5426733380986419, + "grad_norm": 0.38445526361465454, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0446, + "step": 18980 + }, + { + "epoch": 0.5429592566118656, + "grad_norm": 0.696186363697052, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0364, + "step": 18990 + }, + { + "epoch": 0.5432451751250893, + "grad_norm": 0.6371187567710876, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0401, + "step": 19000 + }, + { + "epoch": 0.5435310936383131, + "grad_norm": 0.6122881174087524, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0345, + "step": 19010 + }, + { + "epoch": 0.5438170121515368, + "grad_norm": 0.4222267270088196, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0456, + "step": 19020 + }, + { + "epoch": 0.5441029306647606, + "grad_norm": 0.6122517585754395, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0434, + "step": 19030 + }, + { + "epoch": 0.5443888491779842, + "grad_norm": 0.2783992886543274, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0354, + "step": 19040 + }, + { + "epoch": 0.544674767691208, + "grad_norm": 0.6433000564575195, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0321, + "step": 19050 + }, + { + "epoch": 0.5449606862044317, + "grad_norm": 0.6967030167579651, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0394, + "step": 19060 + }, + { + "epoch": 0.5452466047176555, + "grad_norm": 0.4799044132232666, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0329, + "step": 19070 + }, + { + "epoch": 0.5455325232308792, + "grad_norm": 0.633895993232727, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0316, + "step": 19080 + }, + { + "epoch": 0.5458184417441029, + "grad_norm": 0.5601945519447327, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0449, + "step": 19090 + }, + { + "epoch": 0.5461043602573267, + "grad_norm": 0.4917007088661194, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0351, + "step": 19100 + }, + { + "epoch": 0.5463902787705504, + "grad_norm": 0.4813363254070282, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.029, + "step": 19110 + }, + { + "epoch": 0.5466761972837741, + "grad_norm": 0.5359676480293274, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0346, + "step": 19120 + }, + { + "epoch": 0.5469621157969978, + "grad_norm": 0.6500958204269409, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0374, + "step": 19130 + }, + { + "epoch": 0.5472480343102216, + "grad_norm": 0.7708510756492615, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0332, + "step": 19140 + }, + { + "epoch": 0.5475339528234453, + "grad_norm": 0.45693230628967285, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0344, + "step": 19150 + }, + { + "epoch": 0.5478198713366691, + "grad_norm": 0.6046226620674133, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0342, + "step": 19160 + }, + { + "epoch": 0.5481057898498928, + "grad_norm": 0.5253175497055054, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0449, + "step": 19170 + }, + { + "epoch": 0.5483917083631165, + "grad_norm": 0.3790060877799988, + "learning_rate": 7.507267205473318e-06, + "loss": 0.037, + "step": 19180 + }, + { + "epoch": 0.5486776268763403, + "grad_norm": 0.37709203362464905, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0346, + "step": 19190 + }, + { + "epoch": 0.5489635453895639, + "grad_norm": 0.3940931558609009, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0427, + "step": 19200 + }, + { + "epoch": 0.5492494639027877, + "grad_norm": 0.761299192905426, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0353, + "step": 19210 + }, + { + "epoch": 0.5495353824160114, + "grad_norm": 0.5268495082855225, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0328, + "step": 19220 + }, + { + "epoch": 0.5498213009292352, + "grad_norm": 0.45624151825904846, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0353, + "step": 19230 + }, + { + "epoch": 0.5501072194424589, + "grad_norm": 0.5374972224235535, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0345, + "step": 19240 + }, + { + "epoch": 0.5503931379556827, + "grad_norm": 0.49830907583236694, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0328, + "step": 19250 + }, + { + "epoch": 0.5506790564689064, + "grad_norm": 0.6223296523094177, + "learning_rate": 7.435514206212475e-06, + "loss": 0.037, + "step": 19260 + }, + { + "epoch": 0.55096497498213, + "grad_norm": 0.42801398038864136, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0371, + "step": 19270 + }, + { + "epoch": 0.5512508934953538, + "grad_norm": 0.3872825801372528, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0314, + "step": 19280 + }, + { + "epoch": 0.5515368120085775, + "grad_norm": 0.3967494070529938, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0349, + "step": 19290 + }, + { + "epoch": 0.5518227305218013, + "grad_norm": 0.42383769154548645, + "learning_rate": 7.399737764864619e-06, + "loss": 0.045, + "step": 19300 + }, + { + "epoch": 0.552108649035025, + "grad_norm": 0.48501884937286377, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0373, + "step": 19310 + }, + { + "epoch": 0.5523945675482488, + "grad_norm": 0.3783693015575409, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0334, + "step": 19320 + }, + { + "epoch": 0.5526804860614725, + "grad_norm": 0.5733019709587097, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0369, + "step": 19330 + }, + { + "epoch": 0.5529664045746963, + "grad_norm": 0.5022825002670288, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0375, + "step": 19340 + }, + { + "epoch": 0.5532523230879199, + "grad_norm": 0.5508015155792236, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0415, + "step": 19350 + }, + { + "epoch": 0.5535382416011436, + "grad_norm": 0.5692425966262817, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0401, + "step": 19360 + }, + { + "epoch": 0.5538241601143674, + "grad_norm": 0.7247840762138367, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0373, + "step": 19370 + }, + { + "epoch": 0.5541100786275911, + "grad_norm": 0.633986234664917, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0435, + "step": 19380 + }, + { + "epoch": 0.5543959971408149, + "grad_norm": 0.8598711490631104, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0424, + "step": 19390 + }, + { + "epoch": 0.5546819156540386, + "grad_norm": 0.782328188419342, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0424, + "step": 19400 + }, + { + "epoch": 0.5549678341672624, + "grad_norm": 0.48890456557273865, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0351, + "step": 19410 + }, + { + "epoch": 0.555253752680486, + "grad_norm": 0.4759981036186218, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0395, + "step": 19420 + }, + { + "epoch": 0.5555396711937098, + "grad_norm": 0.6431323885917664, + "learning_rate": 7.283934675167239e-06, + "loss": 0.036, + "step": 19430 + }, + { + "epoch": 0.5558255897069335, + "grad_norm": 0.6633809208869934, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0333, + "step": 19440 + }, + { + "epoch": 0.5561115082201572, + "grad_norm": 0.3405994772911072, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0375, + "step": 19450 + }, + { + "epoch": 0.556397426733381, + "grad_norm": 0.3443987965583801, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0329, + "step": 19460 + }, + { + "epoch": 0.5566833452466047, + "grad_norm": 0.7973398566246033, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0412, + "step": 19470 + }, + { + "epoch": 0.5569692637598285, + "grad_norm": 0.43843239545822144, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0302, + "step": 19480 + }, + { + "epoch": 0.5572551822730522, + "grad_norm": 0.6797782182693481, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0401, + "step": 19490 + }, + { + "epoch": 0.557541100786276, + "grad_norm": 0.5020610690116882, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0329, + "step": 19500 + }, + { + "epoch": 0.5578270192994996, + "grad_norm": 0.5093050003051758, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0381, + "step": 19510 + }, + { + "epoch": 0.5581129378127234, + "grad_norm": 0.6136947870254517, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0373, + "step": 19520 + }, + { + "epoch": 0.5583988563259471, + "grad_norm": 0.4213317930698395, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0338, + "step": 19530 + }, + { + "epoch": 0.5586847748391708, + "grad_norm": 0.6560636162757874, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0332, + "step": 19540 + }, + { + "epoch": 0.5589706933523946, + "grad_norm": 0.41303765773773193, + "learning_rate": 7.177693135871202e-06, + "loss": 0.03, + "step": 19550 + }, + { + "epoch": 0.5592566118656183, + "grad_norm": 0.5260538458824158, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0328, + "step": 19560 + }, + { + "epoch": 0.559542530378842, + "grad_norm": 0.6076327562332153, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0428, + "step": 19570 + }, + { + "epoch": 0.5598284488920657, + "grad_norm": 0.635111927986145, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0318, + "step": 19580 + }, + { + "epoch": 0.5601143674052895, + "grad_norm": 0.7933056354522705, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0357, + "step": 19590 + }, + { + "epoch": 0.5604002859185132, + "grad_norm": 0.44312241673469543, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0289, + "step": 19600 + }, + { + "epoch": 0.560686204431737, + "grad_norm": 0.36346134543418884, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0354, + "step": 19610 + }, + { + "epoch": 0.5609721229449607, + "grad_norm": 0.49605289101600647, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0367, + "step": 19620 + }, + { + "epoch": 0.5612580414581844, + "grad_norm": 0.7115452289581299, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0354, + "step": 19630 + }, + { + "epoch": 0.5615439599714082, + "grad_norm": 0.650925874710083, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0353, + "step": 19640 + }, + { + "epoch": 0.5618298784846318, + "grad_norm": 0.5046663880348206, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0294, + "step": 19650 + }, + { + "epoch": 0.5621157969978556, + "grad_norm": 0.4441855549812317, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0326, + "step": 19660 + }, + { + "epoch": 0.5624017155110793, + "grad_norm": 0.3956650495529175, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0446, + "step": 19670 + }, + { + "epoch": 0.5626876340243031, + "grad_norm": 0.5384211540222168, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0331, + "step": 19680 + }, + { + "epoch": 0.5629735525375268, + "grad_norm": 0.6183366775512695, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0324, + "step": 19690 + }, + { + "epoch": 0.5632594710507506, + "grad_norm": 0.9116242527961731, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0341, + "step": 19700 + }, + { + "epoch": 0.5635453895639743, + "grad_norm": 0.8171015381813049, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0306, + "step": 19710 + }, + { + "epoch": 0.563831308077198, + "grad_norm": 0.42670243978500366, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0336, + "step": 19720 + }, + { + "epoch": 0.5641172265904217, + "grad_norm": 0.7338811159133911, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0363, + "step": 19730 + }, + { + "epoch": 0.5644031451036454, + "grad_norm": 0.5576338171958923, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0371, + "step": 19740 + }, + { + "epoch": 0.5646890636168692, + "grad_norm": 0.7390629649162292, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0382, + "step": 19750 + }, + { + "epoch": 0.5649749821300929, + "grad_norm": 0.801812469959259, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0379, + "step": 19760 + }, + { + "epoch": 0.5652609006433167, + "grad_norm": 0.5697385668754578, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0369, + "step": 19770 + }, + { + "epoch": 0.5655468191565404, + "grad_norm": 0.4180932343006134, + "learning_rate": 6.975884226362e-06, + "loss": 0.039, + "step": 19780 + }, + { + "epoch": 0.5658327376697642, + "grad_norm": 0.648389995098114, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0346, + "step": 19790 + }, + { + "epoch": 0.5661186561829878, + "grad_norm": 0.9673929214477539, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0392, + "step": 19800 + }, + { + "epoch": 0.5664045746962115, + "grad_norm": 0.4793975353240967, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0321, + "step": 19810 + }, + { + "epoch": 0.5666904932094353, + "grad_norm": 0.5206098556518555, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0319, + "step": 19820 + }, + { + "epoch": 0.566976411722659, + "grad_norm": 0.39929306507110596, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0335, + "step": 19830 + }, + { + "epoch": 0.5672623302358828, + "grad_norm": 0.6819440722465515, + "learning_rate": 6.923644220932124e-06, + "loss": 0.0338, + "step": 19840 + }, + { + "epoch": 0.5675482487491065, + "grad_norm": 0.7612042427062988, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0345, + "step": 19850 + }, + { + "epoch": 0.5678341672623303, + "grad_norm": 0.472676545381546, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0388, + "step": 19860 + }, + { + "epoch": 0.568120085775554, + "grad_norm": 0.48102107644081116, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0304, + "step": 19870 + }, + { + "epoch": 0.5684060042887777, + "grad_norm": 0.4174644649028778, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0315, + "step": 19880 + }, + { + "epoch": 0.5686919228020014, + "grad_norm": 0.4218151271343231, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0413, + "step": 19890 + }, + { + "epoch": 0.5689778413152251, + "grad_norm": 0.8243978023529053, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0399, + "step": 19900 + }, + { + "epoch": 0.5692637598284489, + "grad_norm": 0.400924414396286, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0298, + "step": 19910 + }, + { + "epoch": 0.5695496783416726, + "grad_norm": 0.5199277400970459, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0351, + "step": 19920 + }, + { + "epoch": 0.5698355968548964, + "grad_norm": 0.5238781571388245, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0374, + "step": 19930 + }, + { + "epoch": 0.5701215153681201, + "grad_norm": 0.7451756596565247, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0378, + "step": 19940 + }, + { + "epoch": 0.5704074338813439, + "grad_norm": 0.5029926300048828, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0391, + "step": 19950 + }, + { + "epoch": 0.5706933523945675, + "grad_norm": 0.5532147884368896, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0383, + "step": 19960 + }, + { + "epoch": 0.5709792709077913, + "grad_norm": 0.5694131851196289, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0364, + "step": 19970 + }, + { + "epoch": 0.571265189421015, + "grad_norm": 0.5066515803337097, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0363, + "step": 19980 + }, + { + "epoch": 0.5715511079342387, + "grad_norm": 0.5676470398902893, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0327, + "step": 19990 + }, + { + "epoch": 0.5718370264474625, + "grad_norm": 0.37414318323135376, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0395, + "step": 20000 + }, + { + "epoch": 0.5721229449606862, + "grad_norm": 0.5888793468475342, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0372, + "step": 20010 + }, + { + "epoch": 0.57240886347391, + "grad_norm": 0.6593262553215027, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0329, + "step": 20020 + }, + { + "epoch": 0.5726947819871336, + "grad_norm": 0.6382879614830017, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0286, + "step": 20030 + }, + { + "epoch": 0.5729807005003574, + "grad_norm": 0.6364927887916565, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0383, + "step": 20040 + }, + { + "epoch": 0.5732666190135811, + "grad_norm": 0.4102194011211395, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0342, + "step": 20050 + }, + { + "epoch": 0.5735525375268049, + "grad_norm": 0.6449235081672668, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0315, + "step": 20060 + }, + { + "epoch": 0.5738384560400286, + "grad_norm": 0.708431601524353, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0316, + "step": 20070 + }, + { + "epoch": 0.5741243745532523, + "grad_norm": 0.46444272994995117, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0352, + "step": 20080 + }, + { + "epoch": 0.5744102930664761, + "grad_norm": 0.7026715278625488, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0337, + "step": 20090 + }, + { + "epoch": 0.5746962115796997, + "grad_norm": 0.43397894501686096, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0303, + "step": 20100 + }, + { + "epoch": 0.5749821300929235, + "grad_norm": 0.4937734305858612, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0403, + "step": 20110 + }, + { + "epoch": 0.5752680486061472, + "grad_norm": 0.5981410145759583, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0375, + "step": 20120 + }, + { + "epoch": 0.575553967119371, + "grad_norm": 0.5616198778152466, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0314, + "step": 20130 + }, + { + "epoch": 0.5758398856325947, + "grad_norm": 0.35028502345085144, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0367, + "step": 20140 + }, + { + "epoch": 0.5761258041458185, + "grad_norm": 0.3556109666824341, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0308, + "step": 20150 + }, + { + "epoch": 0.5764117226590422, + "grad_norm": 0.579409658908844, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0344, + "step": 20160 + }, + { + "epoch": 0.5766976411722659, + "grad_norm": 0.4484683573246002, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0312, + "step": 20170 + }, + { + "epoch": 0.5769835596854896, + "grad_norm": 0.3636038899421692, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0337, + "step": 20180 + }, + { + "epoch": 0.5772694781987133, + "grad_norm": 0.6667287349700928, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0343, + "step": 20190 + }, + { + "epoch": 0.5775553967119371, + "grad_norm": 0.26031574606895447, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0303, + "step": 20200 + }, + { + "epoch": 0.5778413152251608, + "grad_norm": 0.6683355569839478, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0316, + "step": 20210 + }, + { + "epoch": 0.5781272337383846, + "grad_norm": 0.4097786843776703, + "learning_rate": 6.596880604028027e-06, + "loss": 0.0346, + "step": 20220 + }, + { + "epoch": 0.5784131522516083, + "grad_norm": 0.45405757427215576, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0345, + "step": 20230 + }, + { + "epoch": 0.5786990707648321, + "grad_norm": 0.28291839361190796, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0323, + "step": 20240 + }, + { + "epoch": 0.5789849892780558, + "grad_norm": 0.5656186938285828, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0299, + "step": 20250 + }, + { + "epoch": 0.5792709077912794, + "grad_norm": 0.6780310869216919, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0309, + "step": 20260 + }, + { + "epoch": 0.5795568263045032, + "grad_norm": 0.3968813121318817, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0347, + "step": 20270 + }, + { + "epoch": 0.5798427448177269, + "grad_norm": 0.6598440408706665, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0329, + "step": 20280 + }, + { + "epoch": 0.5801286633309507, + "grad_norm": 0.4988970458507538, + "learning_rate": 6.53748481975927e-06, + "loss": 0.038, + "step": 20290 + }, + { + "epoch": 0.5804145818441744, + "grad_norm": 0.8016706705093384, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0358, + "step": 20300 + }, + { + "epoch": 0.5807005003573982, + "grad_norm": 0.8367684483528137, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0354, + "step": 20310 + }, + { + "epoch": 0.5809864188706219, + "grad_norm": 0.5730129480361938, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0421, + "step": 20320 + }, + { + "epoch": 0.5812723373838456, + "grad_norm": 0.43631577491760254, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0361, + "step": 20330 + }, + { + "epoch": 0.5815582558970693, + "grad_norm": 0.7001264691352844, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0355, + "step": 20340 + }, + { + "epoch": 0.581844174410293, + "grad_norm": 0.4988951086997986, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0387, + "step": 20350 + }, + { + "epoch": 0.5821300929235168, + "grad_norm": 0.45731016993522644, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0398, + "step": 20360 + }, + { + "epoch": 0.5824160114367405, + "grad_norm": 0.38684406876564026, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0345, + "step": 20370 + }, + { + "epoch": 0.5827019299499643, + "grad_norm": 0.3924580514431, + "learning_rate": 6.461496350649529e-06, + "loss": 0.037, + "step": 20380 + }, + { + "epoch": 0.582987848463188, + "grad_norm": 0.43735265731811523, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0371, + "step": 20390 + }, + { + "epoch": 0.5832737669764118, + "grad_norm": 0.4595138430595398, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0337, + "step": 20400 + }, + { + "epoch": 0.5835596854896354, + "grad_norm": 0.429569810628891, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0284, + "step": 20410 + }, + { + "epoch": 0.5838456040028592, + "grad_norm": 0.5399166345596313, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0353, + "step": 20420 + }, + { + "epoch": 0.5841315225160829, + "grad_norm": 0.5698734521865845, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0361, + "step": 20430 + }, + { + "epoch": 0.5844174410293066, + "grad_norm": 0.35422587394714355, + "learning_rate": 6.411076603575166e-06, + "loss": 0.033, + "step": 20440 + }, + { + "epoch": 0.5847033595425304, + "grad_norm": 0.4475875198841095, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0344, + "step": 20450 + }, + { + "epoch": 0.5849892780557541, + "grad_norm": 0.4950159192085266, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0428, + "step": 20460 + }, + { + "epoch": 0.5852751965689779, + "grad_norm": 0.695249617099762, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0354, + "step": 20470 + }, + { + "epoch": 0.5855611150822015, + "grad_norm": 0.2538593113422394, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0383, + "step": 20480 + }, + { + "epoch": 0.5858470335954253, + "grad_norm": 0.6770910024642944, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0364, + "step": 20490 + }, + { + "epoch": 0.586132952108649, + "grad_norm": 0.7187057733535767, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0319, + "step": 20500 + }, + { + "epoch": 0.5864188706218728, + "grad_norm": 0.34853193163871765, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.033, + "step": 20510 + }, + { + "epoch": 0.5867047891350965, + "grad_norm": 0.8484768271446228, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0331, + "step": 20520 + }, + { + "epoch": 0.5869907076483202, + "grad_norm": 0.6645244359970093, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0353, + "step": 20530 + }, + { + "epoch": 0.587276626161544, + "grad_norm": 0.5094996690750122, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0374, + "step": 20540 + }, + { + "epoch": 0.5875625446747677, + "grad_norm": 0.5012859106063843, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0329, + "step": 20550 + }, + { + "epoch": 0.5878484631879914, + "grad_norm": 0.6465861797332764, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0282, + "step": 20560 + }, + { + "epoch": 0.5881343817012151, + "grad_norm": 0.5694834589958191, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0313, + "step": 20570 + }, + { + "epoch": 0.5884203002144389, + "grad_norm": 0.4945555627346039, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0353, + "step": 20580 + }, + { + "epoch": 0.5887062187276626, + "grad_norm": 0.5606586933135986, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0343, + "step": 20590 + }, + { + "epoch": 0.5889921372408864, + "grad_norm": 0.6913802027702332, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0358, + "step": 20600 + }, + { + "epoch": 0.5892780557541101, + "grad_norm": 0.8119901418685913, + "learning_rate": 6.269280523549298e-06, + "loss": 0.038, + "step": 20610 + }, + { + "epoch": 0.5895639742673338, + "grad_norm": 0.5558752417564392, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0311, + "step": 20620 + }, + { + "epoch": 0.5898498927805575, + "grad_norm": 0.45028987526893616, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0321, + "step": 20630 + }, + { + "epoch": 0.5901358112937812, + "grad_norm": 0.3697125017642975, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0331, + "step": 20640 + }, + { + "epoch": 0.590421729807005, + "grad_norm": 0.5406038761138916, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0445, + "step": 20650 + }, + { + "epoch": 0.5907076483202287, + "grad_norm": 0.4301048219203949, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0371, + "step": 20660 + }, + { + "epoch": 0.5909935668334525, + "grad_norm": 0.6343403458595276, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0353, + "step": 20670 + }, + { + "epoch": 0.5912794853466762, + "grad_norm": 0.4666310250759125, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0352, + "step": 20680 + }, + { + "epoch": 0.5915654038599, + "grad_norm": 0.7471063733100891, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0352, + "step": 20690 + }, + { + "epoch": 0.5918513223731237, + "grad_norm": 0.9971692562103271, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0327, + "step": 20700 + }, + { + "epoch": 0.5921372408863473, + "grad_norm": 0.5646237134933472, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0365, + "step": 20710 + }, + { + "epoch": 0.5924231593995711, + "grad_norm": 0.46781328320503235, + "learning_rate": 6.17838207381795e-06, + "loss": 0.042, + "step": 20720 + }, + { + "epoch": 0.5927090779127948, + "grad_norm": 0.7061547040939331, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0484, + "step": 20730 + }, + { + "epoch": 0.5929949964260186, + "grad_norm": 0.6651175618171692, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0353, + "step": 20740 + }, + { + "epoch": 0.5932809149392423, + "grad_norm": 0.5959596037864685, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0344, + "step": 20750 + }, + { + "epoch": 0.5935668334524661, + "grad_norm": 0.5869056582450867, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0389, + "step": 20760 + }, + { + "epoch": 0.5938527519656898, + "grad_norm": 0.42101356387138367, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0288, + "step": 20770 + }, + { + "epoch": 0.5941386704789136, + "grad_norm": 0.6310023069381714, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0362, + "step": 20780 + }, + { + "epoch": 0.5944245889921372, + "grad_norm": 0.6737013459205627, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0377, + "step": 20790 + }, + { + "epoch": 0.5947105075053609, + "grad_norm": 0.6716046333312988, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0415, + "step": 20800 + }, + { + "epoch": 0.5949964260185847, + "grad_norm": 0.9742669463157654, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0337, + "step": 20810 + }, + { + "epoch": 0.5952823445318084, + "grad_norm": 0.571782648563385, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0362, + "step": 20820 + }, + { + "epoch": 0.5955682630450322, + "grad_norm": 0.9673911333084106, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0362, + "step": 20830 + }, + { + "epoch": 0.5958541815582559, + "grad_norm": 0.5391695499420166, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0331, + "step": 20840 + }, + { + "epoch": 0.5961401000714797, + "grad_norm": 1.4766349792480469, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0332, + "step": 20850 + }, + { + "epoch": 0.5964260185847033, + "grad_norm": 0.6329004168510437, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0375, + "step": 20860 + }, + { + "epoch": 0.5967119370979271, + "grad_norm": 0.6745501160621643, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0347, + "step": 20870 + }, + { + "epoch": 0.5969978556111508, + "grad_norm": 0.3006536364555359, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0321, + "step": 20880 + }, + { + "epoch": 0.5972837741243745, + "grad_norm": 0.4666125476360321, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0363, + "step": 20890 + }, + { + "epoch": 0.5975696926375983, + "grad_norm": 0.3881456255912781, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0318, + "step": 20900 + }, + { + "epoch": 0.597855611150822, + "grad_norm": 0.4211449921131134, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0357, + "step": 20910 + }, + { + "epoch": 0.5981415296640458, + "grad_norm": 1.125683307647705, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0364, + "step": 20920 + }, + { + "epoch": 0.5984274481772694, + "grad_norm": 0.9670853614807129, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0385, + "step": 20930 + }, + { + "epoch": 0.5987133666904932, + "grad_norm": 0.7302138209342957, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0321, + "step": 20940 + }, + { + "epoch": 0.5989992852037169, + "grad_norm": 0.7883613109588623, + "learning_rate": 5.990549152010853e-06, + "loss": 0.038, + "step": 20950 + }, + { + "epoch": 0.5992852037169407, + "grad_norm": 0.44051188230514526, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0423, + "step": 20960 + }, + { + "epoch": 0.5995711222301644, + "grad_norm": 0.5225116014480591, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0293, + "step": 20970 + }, + { + "epoch": 0.5998570407433881, + "grad_norm": 0.44672495126724243, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0314, + "step": 20980 + }, + { + "epoch": 0.6001429592566119, + "grad_norm": 0.4489240050315857, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0298, + "step": 20990 + }, + { + "epoch": 0.6004288777698356, + "grad_norm": 0.3942757844924927, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0323, + "step": 21000 + }, + { + "epoch": 0.6007147962830593, + "grad_norm": 0.5079668760299683, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0435, + "step": 21010 + }, + { + "epoch": 0.601000714796283, + "grad_norm": 0.5057359933853149, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0364, + "step": 21020 + }, + { + "epoch": 0.6012866333095068, + "grad_norm": 0.4823545515537262, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0408, + "step": 21030 + }, + { + "epoch": 0.6015725518227305, + "grad_norm": 0.42647498846054077, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0366, + "step": 21040 + }, + { + "epoch": 0.6018584703359543, + "grad_norm": 0.5967830419540405, + "learning_rate": 5.909845843697164e-06, + "loss": 0.037, + "step": 21050 + }, + { + "epoch": 0.602144388849178, + "grad_norm": 0.4567292034626007, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0306, + "step": 21060 + }, + { + "epoch": 0.6024303073624017, + "grad_norm": 0.6767273545265198, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0447, + "step": 21070 + }, + { + "epoch": 0.6027162258756255, + "grad_norm": 0.2957002520561218, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0339, + "step": 21080 + }, + { + "epoch": 0.6030021443888491, + "grad_norm": 0.6870969533920288, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0313, + "step": 21090 + }, + { + "epoch": 0.6032880629020729, + "grad_norm": 0.530910313129425, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0377, + "step": 21100 + }, + { + "epoch": 0.6035739814152966, + "grad_norm": 0.21370625495910645, + "learning_rate": 5.86170998451151e-06, + "loss": 0.032, + "step": 21110 + }, + { + "epoch": 0.6038598999285204, + "grad_norm": 0.6039503812789917, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0258, + "step": 21120 + }, + { + "epoch": 0.6041458184417441, + "grad_norm": 0.5375682711601257, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0341, + "step": 21130 + }, + { + "epoch": 0.6044317369549679, + "grad_norm": 0.4819096326828003, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0309, + "step": 21140 + }, + { + "epoch": 0.6047176554681916, + "grad_norm": 0.31165415048599243, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0278, + "step": 21150 + }, + { + "epoch": 0.6050035739814152, + "grad_norm": 0.2781001925468445, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0342, + "step": 21160 + }, + { + "epoch": 0.605289492494639, + "grad_norm": 0.44726037979125977, + "learning_rate": 5.813791207086085e-06, + "loss": 0.032, + "step": 21170 + }, + { + "epoch": 0.6055754110078627, + "grad_norm": 0.5762766599655151, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0325, + "step": 21180 + }, + { + "epoch": 0.6058613295210865, + "grad_norm": 0.49829939007759094, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0322, + "step": 21190 + }, + { + "epoch": 0.6061472480343102, + "grad_norm": 0.4683297276496887, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0369, + "step": 21200 + }, + { + "epoch": 0.606433166547534, + "grad_norm": 0.662159264087677, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0278, + "step": 21210 + }, + { + "epoch": 0.6067190850607577, + "grad_norm": 0.4397001564502716, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0366, + "step": 21220 + }, + { + "epoch": 0.6070050035739815, + "grad_norm": 0.4977007508277893, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0293, + "step": 21230 + }, + { + "epoch": 0.6072909220872051, + "grad_norm": 0.3705490827560425, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0315, + "step": 21240 + }, + { + "epoch": 0.6075768406004288, + "grad_norm": 0.6350240111351013, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0286, + "step": 21250 + }, + { + "epoch": 0.6078627591136526, + "grad_norm": 0.5590423941612244, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0373, + "step": 21260 + }, + { + "epoch": 0.6081486776268763, + "grad_norm": 0.5244049429893494, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0325, + "step": 21270 + }, + { + "epoch": 0.6084345961401001, + "grad_norm": 1.082044005393982, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0373, + "step": 21280 + }, + { + "epoch": 0.6087205146533238, + "grad_norm": 0.614028811454773, + "learning_rate": 5.71861298612245e-06, + "loss": 0.031, + "step": 21290 + }, + { + "epoch": 0.6090064331665476, + "grad_norm": 0.783205509185791, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0289, + "step": 21300 + }, + { + "epoch": 0.6092923516797712, + "grad_norm": 0.5420807600021362, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.031, + "step": 21310 + }, + { + "epoch": 0.609578270192995, + "grad_norm": 0.42979222536087036, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0291, + "step": 21320 + }, + { + "epoch": 0.6098641887062187, + "grad_norm": 0.44511356949806213, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.031, + "step": 21330 + }, + { + "epoch": 0.6101501072194424, + "grad_norm": 0.528799831867218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0269, + "step": 21340 + }, + { + "epoch": 0.6104360257326662, + "grad_norm": 0.43274471163749695, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0438, + "step": 21350 + }, + { + "epoch": 0.6107219442458899, + "grad_norm": 0.8020172715187073, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0393, + "step": 21360 + }, + { + "epoch": 0.6110078627591137, + "grad_norm": 0.4354296028614044, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0338, + "step": 21370 + }, + { + "epoch": 0.6112937812723374, + "grad_norm": 0.587364673614502, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0359, + "step": 21380 + }, + { + "epoch": 0.6115796997855611, + "grad_norm": 0.5426310300827026, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0333, + "step": 21390 + }, + { + "epoch": 0.6118656182987848, + "grad_norm": 0.5900459289550781, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0344, + "step": 21400 + }, + { + "epoch": 0.6121515368120086, + "grad_norm": 0.5652357935905457, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0396, + "step": 21410 + }, + { + "epoch": 0.6124374553252323, + "grad_norm": 0.5287114977836609, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0387, + "step": 21420 + }, + { + "epoch": 0.612723373838456, + "grad_norm": 0.7939184904098511, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0351, + "step": 21430 + }, + { + "epoch": 0.6130092923516798, + "grad_norm": 0.6840642094612122, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0349, + "step": 21440 + }, + { + "epoch": 0.6132952108649035, + "grad_norm": 0.3717428147792816, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0336, + "step": 21450 + }, + { + "epoch": 0.6135811293781273, + "grad_norm": 0.5073713064193726, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0326, + "step": 21460 + }, + { + "epoch": 0.6138670478913509, + "grad_norm": 1.1579232215881348, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0388, + "step": 21470 + }, + { + "epoch": 0.6141529664045747, + "grad_norm": 0.4209369122982025, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0307, + "step": 21480 + }, + { + "epoch": 0.6144388849177984, + "grad_norm": 0.38663822412490845, + "learning_rate": 5.561973825289734e-06, + "loss": 0.037, + "step": 21490 + }, + { + "epoch": 0.6147248034310222, + "grad_norm": 0.538270890712738, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0333, + "step": 21500 + }, + { + "epoch": 0.6150107219442459, + "grad_norm": 0.28280535340309143, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0319, + "step": 21510 + }, + { + "epoch": 0.6152966404574696, + "grad_norm": 0.5407803058624268, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0352, + "step": 21520 + }, + { + "epoch": 0.6155825589706934, + "grad_norm": 1.4600974321365356, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0409, + "step": 21530 + }, + { + "epoch": 0.615868477483917, + "grad_norm": 0.659900426864624, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0322, + "step": 21540 + }, + { + "epoch": 0.6161543959971408, + "grad_norm": 0.6401934623718262, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0339, + "step": 21550 + }, + { + "epoch": 0.6164403145103645, + "grad_norm": 0.6409866213798523, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0331, + "step": 21560 + }, + { + "epoch": 0.6167262330235883, + "grad_norm": 0.6627630591392517, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0332, + "step": 21570 + }, + { + "epoch": 0.617012151536812, + "grad_norm": 0.6180721521377563, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0327, + "step": 21580 + }, + { + "epoch": 0.6172980700500358, + "grad_norm": 0.4689866006374359, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0276, + "step": 21590 + }, + { + "epoch": 0.6175839885632595, + "grad_norm": 0.5039265751838684, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0427, + "step": 21600 + }, + { + "epoch": 0.6178699070764831, + "grad_norm": 0.5313833355903625, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0351, + "step": 21610 + }, + { + "epoch": 0.6181558255897069, + "grad_norm": 0.4919044077396393, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0327, + "step": 21620 + }, + { + "epoch": 0.6184417441029306, + "grad_norm": 0.5446444153785706, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0331, + "step": 21630 + }, + { + "epoch": 0.6187276626161544, + "grad_norm": 0.5198109745979309, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.032, + "step": 21640 + }, + { + "epoch": 0.6190135811293781, + "grad_norm": 0.5684625506401062, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0339, + "step": 21650 + }, + { + "epoch": 0.6192994996426019, + "grad_norm": 0.6882810592651367, + "learning_rate": 5.430834687545416e-06, + "loss": 0.035, + "step": 21660 + }, + { + "epoch": 0.6195854181558256, + "grad_norm": 0.7360101938247681, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0306, + "step": 21670 + }, + { + "epoch": 0.6198713366690494, + "grad_norm": 0.5557180047035217, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0241, + "step": 21680 + }, + { + "epoch": 0.620157255182273, + "grad_norm": 0.4302096962928772, + "learning_rate": 5.407887295494495e-06, + "loss": 0.035, + "step": 21690 + }, + { + "epoch": 0.6204431736954967, + "grad_norm": 0.4740016460418701, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0331, + "step": 21700 + }, + { + "epoch": 0.6207290922087205, + "grad_norm": 0.5400598049163818, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0297, + "step": 21710 + }, + { + "epoch": 0.6210150107219442, + "grad_norm": 0.4270641803741455, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0334, + "step": 21720 + }, + { + "epoch": 0.621300929235168, + "grad_norm": 0.41063550114631653, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0375, + "step": 21730 + }, + { + "epoch": 0.6215868477483917, + "grad_norm": 0.48556044697761536, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0291, + "step": 21740 + }, + { + "epoch": 0.6218727662616155, + "grad_norm": 0.2872731387615204, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0323, + "step": 21750 + }, + { + "epoch": 0.6221586847748392, + "grad_norm": 0.4088454246520996, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0307, + "step": 21760 + }, + { + "epoch": 0.622444603288063, + "grad_norm": 0.42600440979003906, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0326, + "step": 21770 + }, + { + "epoch": 0.6227305218012866, + "grad_norm": 0.36466315388679504, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0337, + "step": 21780 + }, + { + "epoch": 0.6230164403145103, + "grad_norm": 0.588921308517456, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0336, + "step": 21790 + }, + { + "epoch": 0.6233023588277341, + "grad_norm": 0.44768571853637695, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0326, + "step": 21800 + }, + { + "epoch": 0.6235882773409578, + "grad_norm": 1.1612637042999268, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0355, + "step": 21810 + }, + { + "epoch": 0.6238741958541816, + "grad_norm": 1.0912114381790161, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0436, + "step": 21820 + }, + { + "epoch": 0.6241601143674053, + "grad_norm": 0.5813164710998535, + "learning_rate": 5.301584321328435e-06, + "loss": 0.034, + "step": 21830 + }, + { + "epoch": 0.624446032880629, + "grad_norm": 0.45064911246299744, + "learning_rate": 5.294041118587667e-06, + "loss": 0.032, + "step": 21840 + }, + { + "epoch": 0.6247319513938527, + "grad_norm": 0.5173943638801575, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0322, + "step": 21850 + }, + { + "epoch": 0.6250178699070765, + "grad_norm": 0.41157352924346924, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0319, + "step": 21860 + }, + { + "epoch": 0.6253037884203002, + "grad_norm": 0.5711286067962646, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0288, + "step": 21870 + }, + { + "epoch": 0.6255897069335239, + "grad_norm": 0.5108116865158081, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0325, + "step": 21880 + }, + { + "epoch": 0.6258756254467477, + "grad_norm": 0.49562424421310425, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0306, + "step": 21890 + }, + { + "epoch": 0.6261615439599714, + "grad_norm": 0.3392108976840973, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0358, + "step": 21900 + }, + { + "epoch": 0.6264474624731952, + "grad_norm": 1.0588114261627197, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0404, + "step": 21910 + }, + { + "epoch": 0.6267333809864188, + "grad_norm": 0.6979959607124329, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0349, + "step": 21920 + }, + { + "epoch": 0.6270192994996426, + "grad_norm": 0.3185918927192688, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0276, + "step": 21930 + }, + { + "epoch": 0.6273052180128663, + "grad_norm": 0.3921501338481903, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0326, + "step": 21940 + }, + { + "epoch": 0.6275911365260901, + "grad_norm": 0.9666212797164917, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0346, + "step": 21950 + }, + { + "epoch": 0.6278770550393138, + "grad_norm": 0.4483211040496826, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0306, + "step": 21960 + }, + { + "epoch": 0.6281629735525375, + "grad_norm": 0.4839077293872833, + "learning_rate": 5.196592054173714e-06, + "loss": 0.026, + "step": 21970 + }, + { + "epoch": 0.6284488920657613, + "grad_norm": 0.5054528117179871, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0299, + "step": 21980 + }, + { + "epoch": 0.628734810578985, + "grad_norm": 0.5953076481819153, + "learning_rate": 5.181701567303612e-06, + "loss": 0.036, + "step": 21990 + }, + { + "epoch": 0.6290207290922087, + "grad_norm": 0.39300060272216797, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0358, + "step": 22000 + }, + { + "epoch": 0.6293066476054324, + "grad_norm": 0.42864665389060974, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0363, + "step": 22010 + }, + { + "epoch": 0.6295925661186562, + "grad_norm": 0.33609238266944885, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0398, + "step": 22020 + }, + { + "epoch": 0.6298784846318799, + "grad_norm": 0.4237107038497925, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0319, + "step": 22030 + }, + { + "epoch": 0.6301644031451037, + "grad_norm": 0.42774054408073425, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0332, + "step": 22040 + }, + { + "epoch": 0.6304503216583274, + "grad_norm": 0.8992825150489807, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0396, + "step": 22050 + }, + { + "epoch": 0.630736240171551, + "grad_norm": 0.20832861959934235, + "learning_rate": 5.129800405815733e-06, + "loss": 0.03, + "step": 22060 + }, + { + "epoch": 0.6310221586847748, + "grad_norm": 0.5961321592330933, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0429, + "step": 22070 + }, + { + "epoch": 0.6313080771979985, + "grad_norm": 0.5037736296653748, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0332, + "step": 22080 + }, + { + "epoch": 0.6315939957112223, + "grad_norm": 0.383732408285141, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0293, + "step": 22090 + }, + { + "epoch": 0.631879914224446, + "grad_norm": 0.8124368786811829, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0286, + "step": 22100 + }, + { + "epoch": 0.6321658327376698, + "grad_norm": 0.96833735704422, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0346, + "step": 22110 + }, + { + "epoch": 0.6324517512508935, + "grad_norm": 0.42382001876831055, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0345, + "step": 22120 + }, + { + "epoch": 0.6327376697641173, + "grad_norm": 0.5928776860237122, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0314, + "step": 22130 + }, + { + "epoch": 0.633023588277341, + "grad_norm": 0.7822670340538025, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0335, + "step": 22140 + }, + { + "epoch": 0.6333095067905646, + "grad_norm": 0.6383520364761353, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0321, + "step": 22150 + }, + { + "epoch": 0.6335954253037884, + "grad_norm": 0.3413240611553192, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0314, + "step": 22160 + }, + { + "epoch": 0.6338813438170121, + "grad_norm": 0.5960783958435059, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0385, + "step": 22170 + }, + { + "epoch": 0.6341672623302359, + "grad_norm": 0.2557702660560608, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0316, + "step": 22180 + }, + { + "epoch": 0.6344531808434596, + "grad_norm": 0.6229982376098633, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0325, + "step": 22190 + }, + { + "epoch": 0.6347390993566834, + "grad_norm": 0.5080077052116394, + "learning_rate": 5.027013727107874e-06, + "loss": 0.036, + "step": 22200 + }, + { + "epoch": 0.6350250178699071, + "grad_norm": 0.5630851984024048, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0279, + "step": 22210 + }, + { + "epoch": 0.6353109363831309, + "grad_norm": 0.81584233045578, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0343, + "step": 22220 + }, + { + "epoch": 0.6355968548963545, + "grad_norm": 0.3985321521759033, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0289, + "step": 22230 + }, + { + "epoch": 0.6358827734095782, + "grad_norm": 0.4481184482574463, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0345, + "step": 22240 + }, + { + "epoch": 0.636168691922802, + "grad_norm": 0.3640075623989105, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0307, + "step": 22250 + }, + { + "epoch": 0.6364546104360257, + "grad_norm": 0.4006771147251129, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0381, + "step": 22260 + }, + { + "epoch": 0.6367405289492495, + "grad_norm": 0.7638134360313416, + "learning_rate": 4.976134120528886e-06, + "loss": 0.039, + "step": 22270 + }, + { + "epoch": 0.6370264474624732, + "grad_norm": 0.4820837080478668, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0281, + "step": 22280 + }, + { + "epoch": 0.637312365975697, + "grad_norm": 0.5928444266319275, + "learning_rate": 4.961660586405147e-06, + "loss": 0.033, + "step": 22290 + }, + { + "epoch": 0.6375982844889206, + "grad_norm": 0.50687575340271, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0357, + "step": 22300 + }, + { + "epoch": 0.6378842030021444, + "grad_norm": 0.673939049243927, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0301, + "step": 22310 + }, + { + "epoch": 0.6381701215153681, + "grad_norm": 0.4300031065940857, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.029, + "step": 22320 + }, + { + "epoch": 0.6384560400285918, + "grad_norm": 0.6585102677345276, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0333, + "step": 22330 + }, + { + "epoch": 0.6387419585418156, + "grad_norm": 0.6430448889732361, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0345, + "step": 22340 + }, + { + "epoch": 0.6390278770550393, + "grad_norm": 0.8272712826728821, + "learning_rate": 4.918410326949594e-06, + "loss": 0.034, + "step": 22350 + }, + { + "epoch": 0.6393137955682631, + "grad_norm": 0.7631726861000061, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0321, + "step": 22360 + }, + { + "epoch": 0.6395997140814867, + "grad_norm": 0.5562252402305603, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0339, + "step": 22370 + }, + { + "epoch": 0.6398856325947105, + "grad_norm": 0.6027814149856567, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0355, + "step": 22380 + }, + { + "epoch": 0.6401715511079342, + "grad_norm": 0.3548984229564667, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0343, + "step": 22390 + }, + { + "epoch": 0.640457469621158, + "grad_norm": 0.4959709346294403, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.031, + "step": 22400 + }, + { + "epoch": 0.6407433881343817, + "grad_norm": 0.3765028715133667, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.0406, + "step": 22410 + }, + { + "epoch": 0.6410293066476054, + "grad_norm": 0.5014662146568298, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0297, + "step": 22420 + }, + { + "epoch": 0.6413152251608292, + "grad_norm": 0.5085675716400146, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0324, + "step": 22430 + }, + { + "epoch": 0.6416011436740529, + "grad_norm": 0.37595826387405396, + "learning_rate": 4.854017257346105e-06, + "loss": 0.033, + "step": 22440 + }, + { + "epoch": 0.6418870621872766, + "grad_norm": 0.5408678650856018, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0323, + "step": 22450 + }, + { + "epoch": 0.6421729807005003, + "grad_norm": 0.4319652020931244, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0269, + "step": 22460 + }, + { + "epoch": 0.6424588992137241, + "grad_norm": 0.41388124227523804, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0311, + "step": 22470 + }, + { + "epoch": 0.6427448177269478, + "grad_norm": 0.4778555631637573, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0332, + "step": 22480 + }, + { + "epoch": 0.6430307362401716, + "grad_norm": 0.38835474848747253, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0304, + "step": 22490 + }, + { + "epoch": 0.6433166547533953, + "grad_norm": 0.5165611505508423, + "learning_rate": 4.81141273556404e-06, + "loss": 0.0344, + "step": 22500 + }, + { + "epoch": 0.643602573266619, + "grad_norm": 0.4285198450088501, + "learning_rate": 4.804337352679613e-06, + "loss": 0.035, + "step": 22510 + }, + { + "epoch": 0.6438884917798428, + "grad_norm": 0.4512922167778015, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0354, + "step": 22520 + }, + { + "epoch": 0.6441744102930664, + "grad_norm": 0.33437663316726685, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0343, + "step": 22530 + }, + { + "epoch": 0.6444603288062902, + "grad_norm": 0.45291104912757874, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0307, + "step": 22540 + }, + { + "epoch": 0.6447462473195139, + "grad_norm": 0.5920093655586243, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0338, + "step": 22550 + }, + { + "epoch": 0.6450321658327377, + "grad_norm": 0.6362392902374268, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0335, + "step": 22560 + }, + { + "epoch": 0.6453180843459614, + "grad_norm": 0.28033652901649475, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0267, + "step": 22570 + }, + { + "epoch": 0.6456040028591852, + "grad_norm": 0.4563148617744446, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0316, + "step": 22580 + }, + { + "epoch": 0.6458899213724089, + "grad_norm": 0.4889507591724396, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.034, + "step": 22590 + }, + { + "epoch": 0.6461758398856325, + "grad_norm": 0.6826061010360718, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0316, + "step": 22600 + }, + { + "epoch": 0.6464617583988563, + "grad_norm": 0.45066431164741516, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0287, + "step": 22610 + }, + { + "epoch": 0.64674767691208, + "grad_norm": 0.41994187235832214, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0312, + "step": 22620 + }, + { + "epoch": 0.6470335954253038, + "grad_norm": 0.39731675386428833, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0347, + "step": 22630 + }, + { + "epoch": 0.6473195139385275, + "grad_norm": 0.5207498073577881, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0304, + "step": 22640 + }, + { + "epoch": 0.6476054324517513, + "grad_norm": 0.42930668592453003, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0344, + "step": 22650 + }, + { + "epoch": 0.647891350964975, + "grad_norm": 0.3023674488067627, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0359, + "step": 22660 + }, + { + "epoch": 0.6481772694781988, + "grad_norm": 0.43205010890960693, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0323, + "step": 22670 + }, + { + "epoch": 0.6484631879914224, + "grad_norm": 0.5984707474708557, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0318, + "step": 22680 + }, + { + "epoch": 0.6487491065046461, + "grad_norm": 0.43477800488471985, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0346, + "step": 22690 + }, + { + "epoch": 0.6490350250178699, + "grad_norm": 0.3570900857448578, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0342, + "step": 22700 + }, + { + "epoch": 0.6493209435310936, + "grad_norm": 0.47367945313453674, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0367, + "step": 22710 + }, + { + "epoch": 0.6496068620443174, + "grad_norm": 0.3768099844455719, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0357, + "step": 22720 + }, + { + "epoch": 0.6498927805575411, + "grad_norm": 0.6188724040985107, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0299, + "step": 22730 + }, + { + "epoch": 0.6501786990707649, + "grad_norm": 0.5733038783073425, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0343, + "step": 22740 + }, + { + "epoch": 0.6504646175839885, + "grad_norm": 0.5000156164169312, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0284, + "step": 22750 + }, + { + "epoch": 0.6507505360972123, + "grad_norm": 0.22813546657562256, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0288, + "step": 22760 + }, + { + "epoch": 0.651036454610436, + "grad_norm": 0.4805088937282562, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0305, + "step": 22770 + }, + { + "epoch": 0.6513223731236597, + "grad_norm": 0.4652612507343292, + "learning_rate": 4.616077433849538e-06, + "loss": 0.0304, + "step": 22780 + }, + { + "epoch": 0.6516082916368835, + "grad_norm": 0.5010579824447632, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0337, + "step": 22790 + }, + { + "epoch": 0.6518942101501072, + "grad_norm": 0.36260518431663513, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0284, + "step": 22800 + }, + { + "epoch": 0.652180128663331, + "grad_norm": 0.45098820328712463, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0297, + "step": 22810 + }, + { + "epoch": 0.6524660471765547, + "grad_norm": 0.6154504418373108, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0366, + "step": 22820 + }, + { + "epoch": 0.6527519656897784, + "grad_norm": 0.4522152543067932, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.033, + "step": 22830 + }, + { + "epoch": 0.6530378842030021, + "grad_norm": 0.34195253252983093, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0304, + "step": 22840 + }, + { + "epoch": 0.6533238027162259, + "grad_norm": 0.49787941575050354, + "learning_rate": 4.568154392147005e-06, + "loss": 0.033, + "step": 22850 + }, + { + "epoch": 0.6536097212294496, + "grad_norm": 0.5249335765838623, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0381, + "step": 22860 + }, + { + "epoch": 0.6538956397426733, + "grad_norm": 0.7645581960678101, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0298, + "step": 22870 + }, + { + "epoch": 0.6541815582558971, + "grad_norm": 0.6034232974052429, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0313, + "step": 22880 + }, + { + "epoch": 0.6544674767691208, + "grad_norm": 0.3499184846878052, + "learning_rate": 4.54093567906903e-06, + "loss": 0.036, + "step": 22890 + }, + { + "epoch": 0.6547533952823446, + "grad_norm": 0.4157135486602783, + "learning_rate": 4.534149931036931e-06, + "loss": 0.033, + "step": 22900 + }, + { + "epoch": 0.6550393137955682, + "grad_norm": 0.4563712775707245, + "learning_rate": 4.527371771040039e-06, + "loss": 0.0361, + "step": 22910 + }, + { + "epoch": 0.655325232308792, + "grad_norm": 1.080802321434021, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0307, + "step": 22920 + }, + { + "epoch": 0.6556111508220157, + "grad_norm": 0.38259357213974, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0292, + "step": 22930 + }, + { + "epoch": 0.6558970693352395, + "grad_norm": 0.6920587420463562, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0322, + "step": 22940 + }, + { + "epoch": 0.6561829878484632, + "grad_norm": 0.628978967666626, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0391, + "step": 22950 + }, + { + "epoch": 0.6564689063616869, + "grad_norm": 0.4848436713218689, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0306, + "step": 22960 + }, + { + "epoch": 0.6567548248749107, + "grad_norm": 0.4478876292705536, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0334, + "step": 22970 + }, + { + "epoch": 0.6570407433881343, + "grad_norm": 0.47360673546791077, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0357, + "step": 22980 + }, + { + "epoch": 0.6573266619013581, + "grad_norm": 0.32840496301651, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0339, + "step": 22990 + }, + { + "epoch": 0.6576125804145818, + "grad_norm": 0.4047236442565918, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0321, + "step": 23000 + }, + { + "epoch": 0.6578984989278056, + "grad_norm": 0.7817053198814392, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0375, + "step": 23010 + }, + { + "epoch": 0.6581844174410293, + "grad_norm": 0.38985809683799744, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0343, + "step": 23020 + }, + { + "epoch": 0.6584703359542531, + "grad_norm": 0.45360830426216125, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0287, + "step": 23030 + }, + { + "epoch": 0.6587562544674768, + "grad_norm": 0.2886345088481903, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0322, + "step": 23040 + }, + { + "epoch": 0.6590421729807004, + "grad_norm": 0.8546258211135864, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0331, + "step": 23050 + }, + { + "epoch": 0.6593280914939242, + "grad_norm": 0.48426172137260437, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0343, + "step": 23060 + }, + { + "epoch": 0.6596140100071479, + "grad_norm": 0.46379074454307556, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0296, + "step": 23070 + }, + { + "epoch": 0.6598999285203717, + "grad_norm": 0.7772185206413269, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0319, + "step": 23080 + }, + { + "epoch": 0.6601858470335954, + "grad_norm": 0.4606277644634247, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0336, + "step": 23090 + }, + { + "epoch": 0.6604717655468192, + "grad_norm": 0.43342530727386475, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0287, + "step": 23100 + }, + { + "epoch": 0.6607576840600429, + "grad_norm": 0.385151207447052, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0363, + "step": 23110 + }, + { + "epoch": 0.6610436025732667, + "grad_norm": 0.3960207998752594, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0306, + "step": 23120 + }, + { + "epoch": 0.6613295210864903, + "grad_norm": 0.41210439801216125, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0348, + "step": 23130 + }, + { + "epoch": 0.661615439599714, + "grad_norm": 0.41976168751716614, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0272, + "step": 23140 + }, + { + "epoch": 0.6619013581129378, + "grad_norm": 0.3195948004722595, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0362, + "step": 23150 + }, + { + "epoch": 0.6621872766261615, + "grad_norm": 0.7024016380310059, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0316, + "step": 23160 + }, + { + "epoch": 0.6624731951393853, + "grad_norm": 0.2894183099269867, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0339, + "step": 23170 + }, + { + "epoch": 0.662759113652609, + "grad_norm": 0.489715576171875, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0272, + "step": 23180 + }, + { + "epoch": 0.6630450321658328, + "grad_norm": 0.3406641185283661, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0272, + "step": 23190 + }, + { + "epoch": 0.6633309506790565, + "grad_norm": 0.3647848963737488, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0337, + "step": 23200 + }, + { + "epoch": 0.6636168691922802, + "grad_norm": 0.7023333311080933, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0334, + "step": 23210 + }, + { + "epoch": 0.6639027877055039, + "grad_norm": 0.43989211320877075, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0313, + "step": 23220 + }, + { + "epoch": 0.6641887062187276, + "grad_norm": 0.7329099774360657, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0283, + "step": 23230 + }, + { + "epoch": 0.6644746247319514, + "grad_norm": 0.3954019546508789, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0321, + "step": 23240 + }, + { + "epoch": 0.6647605432451751, + "grad_norm": 0.38020703196525574, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0337, + "step": 23250 + }, + { + "epoch": 0.6650464617583989, + "grad_norm": 0.5988985300064087, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0353, + "step": 23260 + }, + { + "epoch": 0.6653323802716226, + "grad_norm": 0.4259869158267975, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0316, + "step": 23270 + }, + { + "epoch": 0.6656182987848464, + "grad_norm": 0.4322545528411865, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0318, + "step": 23280 + }, + { + "epoch": 0.66590421729807, + "grad_norm": 0.40275540947914124, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0344, + "step": 23290 + }, + { + "epoch": 0.6661901358112938, + "grad_norm": 0.5070827603340149, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0336, + "step": 23300 + }, + { + "epoch": 0.6664760543245175, + "grad_norm": 0.614973247051239, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0352, + "step": 23310 + }, + { + "epoch": 0.6667619728377412, + "grad_norm": 0.4637722074985504, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0277, + "step": 23320 + }, + { + "epoch": 0.667047891350965, + "grad_norm": 0.34951677918434143, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0284, + "step": 23330 + }, + { + "epoch": 0.6673338098641887, + "grad_norm": 0.5609407424926758, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0304, + "step": 23340 + }, + { + "epoch": 0.6676197283774125, + "grad_norm": 0.44585973024368286, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0263, + "step": 23350 + }, + { + "epoch": 0.6679056468906361, + "grad_norm": 0.5311269760131836, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0311, + "step": 23360 + }, + { + "epoch": 0.6681915654038599, + "grad_norm": 0.4923100471496582, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0277, + "step": 23370 + }, + { + "epoch": 0.6684774839170836, + "grad_norm": 0.5254819989204407, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0328, + "step": 23380 + }, + { + "epoch": 0.6687634024303074, + "grad_norm": 0.47537869215011597, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0373, + "step": 23390 + }, + { + "epoch": 0.6690493209435311, + "grad_norm": 0.40087464451789856, + "learning_rate": 4.204700678381975e-06, + "loss": 0.034, + "step": 23400 + }, + { + "epoch": 0.6693352394567548, + "grad_norm": 0.5166190266609192, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0314, + "step": 23410 + }, + { + "epoch": 0.6696211579699786, + "grad_norm": 0.42874693870544434, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0279, + "step": 23420 + }, + { + "epoch": 0.6699070764832022, + "grad_norm": 0.3685651123523712, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0313, + "step": 23430 + }, + { + "epoch": 0.670192994996426, + "grad_norm": 0.5417486429214478, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.033, + "step": 23440 + }, + { + "epoch": 0.6704789135096497, + "grad_norm": 0.5764726996421814, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0368, + "step": 23450 + }, + { + "epoch": 0.6707648320228735, + "grad_norm": 0.44168850779533386, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0258, + "step": 23460 + }, + { + "epoch": 0.6710507505360972, + "grad_norm": 0.39990919828414917, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0403, + "step": 23470 + }, + { + "epoch": 0.671336669049321, + "grad_norm": 0.7526253461837769, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0334, + "step": 23480 + }, + { + "epoch": 0.6716225875625447, + "grad_norm": 0.4888451397418976, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0314, + "step": 23490 + }, + { + "epoch": 0.6719085060757684, + "grad_norm": 0.5732892751693726, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0277, + "step": 23500 + }, + { + "epoch": 0.6721944245889921, + "grad_norm": 0.5806633830070496, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0347, + "step": 23510 + }, + { + "epoch": 0.6724803431022158, + "grad_norm": 0.4336501657962799, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0369, + "step": 23520 + }, + { + "epoch": 0.6727662616154396, + "grad_norm": 0.47082582116127014, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0408, + "step": 23530 + }, + { + "epoch": 0.6730521801286633, + "grad_norm": 0.6571422815322876, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0316, + "step": 23540 + }, + { + "epoch": 0.6733380986418871, + "grad_norm": 0.4899539649486542, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0256, + "step": 23550 + }, + { + "epoch": 0.6736240171551108, + "grad_norm": 0.3201868236064911, + "learning_rate": 4.103441847743051e-06, + "loss": 0.029, + "step": 23560 + }, + { + "epoch": 0.6739099356683346, + "grad_norm": 0.4385588765144348, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0284, + "step": 23570 + }, + { + "epoch": 0.6741958541815583, + "grad_norm": 0.5079174637794495, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0298, + "step": 23580 + }, + { + "epoch": 0.6744817726947819, + "grad_norm": 0.609523355960846, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0313, + "step": 23590 + }, + { + "epoch": 0.6747676912080057, + "grad_norm": 0.487690269947052, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0246, + "step": 23600 + }, + { + "epoch": 0.6750536097212294, + "grad_norm": 0.5146880745887756, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0319, + "step": 23610 + }, + { + "epoch": 0.6753395282344532, + "grad_norm": 0.5848239064216614, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0282, + "step": 23620 + }, + { + "epoch": 0.6756254467476769, + "grad_norm": 0.7779616117477417, + "learning_rate": 4.05979084812184e-06, + "loss": 0.033, + "step": 23630 + }, + { + "epoch": 0.6759113652609007, + "grad_norm": 0.3329331576824188, + "learning_rate": 4.053587511509546e-06, + "loss": 0.028, + "step": 23640 + }, + { + "epoch": 0.6761972837741244, + "grad_norm": 0.4691336154937744, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0313, + "step": 23650 + }, + { + "epoch": 0.6764832022873482, + "grad_norm": 0.47258421778678894, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0326, + "step": 23660 + }, + { + "epoch": 0.6767691208005718, + "grad_norm": 0.5333718657493591, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0332, + "step": 23670 + }, + { + "epoch": 0.6770550393137955, + "grad_norm": 0.7278451323509216, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0409, + "step": 23680 + }, + { + "epoch": 0.6773409578270193, + "grad_norm": 0.41567277908325195, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0263, + "step": 23690 + }, + { + "epoch": 0.677626876340243, + "grad_norm": 0.4351106584072113, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0276, + "step": 23700 + }, + { + "epoch": 0.6779127948534668, + "grad_norm": 0.31096217036247253, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0339, + "step": 23710 + }, + { + "epoch": 0.6781987133666905, + "grad_norm": 0.6321837306022644, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0313, + "step": 23720 + }, + { + "epoch": 0.6784846318799143, + "grad_norm": 0.5278098583221436, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0294, + "step": 23730 + }, + { + "epoch": 0.6787705503931379, + "grad_norm": 0.5778757333755493, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0324, + "step": 23740 + }, + { + "epoch": 0.6790564689063617, + "grad_norm": 0.6164223551750183, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0316, + "step": 23750 + }, + { + "epoch": 0.6793423874195854, + "grad_norm": 0.2872319221496582, + "learning_rate": 3.979785400791052e-06, + "loss": 0.034, + "step": 23760 + }, + { + "epoch": 0.6796283059328091, + "grad_norm": 0.6088704466819763, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0317, + "step": 23770 + }, + { + "epoch": 0.6799142244460329, + "grad_norm": 0.4733040928840637, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0282, + "step": 23780 + }, + { + "epoch": 0.6802001429592566, + "grad_norm": 1.3417131900787354, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0304, + "step": 23790 + }, + { + "epoch": 0.6804860614724804, + "grad_norm": 0.7316146492958069, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0311, + "step": 23800 + }, + { + "epoch": 0.680771979985704, + "grad_norm": 0.5726248025894165, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0323, + "step": 23810 + }, + { + "epoch": 0.6810578984989278, + "grad_norm": 0.3990941345691681, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0277, + "step": 23820 + }, + { + "epoch": 0.6813438170121515, + "grad_norm": 0.49237731099128723, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0287, + "step": 23830 + }, + { + "epoch": 0.6816297355253753, + "grad_norm": 0.47560542821884155, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0298, + "step": 23840 + }, + { + "epoch": 0.681915654038599, + "grad_norm": 0.5967867374420166, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0295, + "step": 23850 + }, + { + "epoch": 0.6822015725518227, + "grad_norm": 0.5726722478866577, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0283, + "step": 23860 + }, + { + "epoch": 0.6824874910650465, + "grad_norm": 0.282678484916687, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0303, + "step": 23870 + }, + { + "epoch": 0.6827734095782702, + "grad_norm": 0.4432118237018585, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0296, + "step": 23880 + }, + { + "epoch": 0.683059328091494, + "grad_norm": 0.33677008748054504, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0379, + "step": 23890 + }, + { + "epoch": 0.6833452466047176, + "grad_norm": 0.5063587427139282, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0281, + "step": 23900 + }, + { + "epoch": 0.6836311651179414, + "grad_norm": 0.2592383921146393, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0263, + "step": 23910 + }, + { + "epoch": 0.6839170836311651, + "grad_norm": 0.4482796788215637, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0289, + "step": 23920 + }, + { + "epoch": 0.6842030021443889, + "grad_norm": 0.2609167993068695, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0294, + "step": 23930 + }, + { + "epoch": 0.6844889206576126, + "grad_norm": 0.36982619762420654, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0306, + "step": 23940 + }, + { + "epoch": 0.6847748391708363, + "grad_norm": 0.47758495807647705, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0273, + "step": 23950 + }, + { + "epoch": 0.68506075768406, + "grad_norm": 0.5566948652267456, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0318, + "step": 23960 + }, + { + "epoch": 0.6853466761972837, + "grad_norm": 0.7815461754798889, + "learning_rate": 3.853493736024934e-06, + "loss": 0.03, + "step": 23970 + }, + { + "epoch": 0.6856325947105075, + "grad_norm": 0.42888402938842773, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0384, + "step": 23980 + }, + { + "epoch": 0.6859185132237312, + "grad_norm": 0.47878748178482056, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0356, + "step": 23990 + }, + { + "epoch": 0.686204431736955, + "grad_norm": 0.3847522735595703, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0272, + "step": 24000 + }, + { + "epoch": 0.6864903502501787, + "grad_norm": 0.7005330920219421, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0314, + "step": 24010 + }, + { + "epoch": 0.6867762687634025, + "grad_norm": 0.7769733667373657, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0306, + "step": 24020 + }, + { + "epoch": 0.6870621872766262, + "grad_norm": 0.4073965847492218, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0286, + "step": 24030 + }, + { + "epoch": 0.6873481057898498, + "grad_norm": 0.6220553517341614, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0358, + "step": 24040 + }, + { + "epoch": 0.6876340243030736, + "grad_norm": 0.32508641481399536, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0284, + "step": 24050 + }, + { + "epoch": 0.6879199428162973, + "grad_norm": 0.4828036427497864, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0308, + "step": 24060 + }, + { + "epoch": 0.6882058613295211, + "grad_norm": 0.4809496998786926, + "learning_rate": 3.794650811106129e-06, + "loss": 0.028, + "step": 24070 + }, + { + "epoch": 0.6884917798427448, + "grad_norm": 0.8497998714447021, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.037, + "step": 24080 + }, + { + "epoch": 0.6887776983559686, + "grad_norm": 0.758666455745697, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0323, + "step": 24090 + }, + { + "epoch": 0.6890636168691923, + "grad_norm": 0.40550050139427185, + "learning_rate": 3.777162510056721e-06, + "loss": 0.0359, + "step": 24100 + }, + { + "epoch": 0.6893495353824161, + "grad_norm": 0.4595869779586792, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0304, + "step": 24110 + }, + { + "epoch": 0.6896354538956397, + "grad_norm": 0.5098794102668762, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0279, + "step": 24120 + }, + { + "epoch": 0.6899213724088634, + "grad_norm": 0.3320889174938202, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0287, + "step": 24130 + }, + { + "epoch": 0.6902072909220872, + "grad_norm": 0.4708438515663147, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0289, + "step": 24140 + }, + { + "epoch": 0.6904932094353109, + "grad_norm": 1.0990219116210938, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0303, + "step": 24150 + }, + { + "epoch": 0.6907791279485347, + "grad_norm": 0.5109107494354248, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0306, + "step": 24160 + }, + { + "epoch": 0.6910650464617584, + "grad_norm": 0.6247434616088867, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0332, + "step": 24170 + }, + { + "epoch": 0.6913509649749822, + "grad_norm": 0.4033079743385315, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0252, + "step": 24180 + }, + { + "epoch": 0.6916368834882058, + "grad_norm": 0.36993420124053955, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0287, + "step": 24190 + }, + { + "epoch": 0.6919228020014296, + "grad_norm": 0.37320762872695923, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0364, + "step": 24200 + }, + { + "epoch": 0.6922087205146533, + "grad_norm": 0.6411201357841492, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0306, + "step": 24210 + }, + { + "epoch": 0.692494639027877, + "grad_norm": 0.7033433318138123, + "learning_rate": 3.707974016467e-06, + "loss": 0.0334, + "step": 24220 + }, + { + "epoch": 0.6927805575411008, + "grad_norm": 0.5307570695877075, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0338, + "step": 24230 + }, + { + "epoch": 0.6930664760543245, + "grad_norm": 0.6726395487785339, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0379, + "step": 24240 + }, + { + "epoch": 0.6933523945675483, + "grad_norm": 0.5609936714172363, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0272, + "step": 24250 + }, + { + "epoch": 0.693638313080772, + "grad_norm": 0.5961005687713623, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0361, + "step": 24260 + }, + { + "epoch": 0.6939242315939957, + "grad_norm": 0.46744176745414734, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0291, + "step": 24270 + }, + { + "epoch": 0.6942101501072194, + "grad_norm": 0.5180732607841492, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0377, + "step": 24280 + }, + { + "epoch": 0.6944960686204432, + "grad_norm": 0.594201922416687, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0312, + "step": 24290 + }, + { + "epoch": 0.6947819871336669, + "grad_norm": 0.5852509140968323, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0303, + "step": 24300 + }, + { + "epoch": 0.6950679056468906, + "grad_norm": 0.7885274291038513, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0329, + "step": 24310 + }, + { + "epoch": 0.6953538241601144, + "grad_norm": 0.5280163884162903, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.031, + "step": 24320 + }, + { + "epoch": 0.6956397426733381, + "grad_norm": 0.6047127842903137, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0283, + "step": 24330 + }, + { + "epoch": 0.6959256611865619, + "grad_norm": 0.43192219734191895, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0338, + "step": 24340 + }, + { + "epoch": 0.6962115796997855, + "grad_norm": 0.3320246636867523, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0262, + "step": 24350 + }, + { + "epoch": 0.6964974982130093, + "grad_norm": 0.46365252137184143, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0342, + "step": 24360 + }, + { + "epoch": 0.696783416726233, + "grad_norm": 0.537933886051178, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0286, + "step": 24370 + }, + { + "epoch": 0.6970693352394568, + "grad_norm": 0.3574221134185791, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0342, + "step": 24380 + }, + { + "epoch": 0.6973552537526805, + "grad_norm": 0.7051029205322266, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0316, + "step": 24390 + }, + { + "epoch": 0.6976411722659042, + "grad_norm": 0.587533712387085, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0302, + "step": 24400 + }, + { + "epoch": 0.697927090779128, + "grad_norm": 0.555778980255127, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0301, + "step": 24410 + }, + { + "epoch": 0.6982130092923516, + "grad_norm": 0.44060736894607544, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0296, + "step": 24420 + }, + { + "epoch": 0.6984989278055754, + "grad_norm": 0.3930843472480774, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0327, + "step": 24430 + }, + { + "epoch": 0.6987848463187991, + "grad_norm": 0.8878913521766663, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0389, + "step": 24440 + }, + { + "epoch": 0.6990707648320229, + "grad_norm": 0.45810988545417786, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0322, + "step": 24450 + }, + { + "epoch": 0.6993566833452466, + "grad_norm": 0.41808775067329407, + "learning_rate": 3.573305344104808e-06, + "loss": 0.032, + "step": 24460 + }, + { + "epoch": 0.6996426018584704, + "grad_norm": 0.5060444474220276, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0317, + "step": 24470 + }, + { + "epoch": 0.6999285203716941, + "grad_norm": 0.28741514682769775, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0271, + "step": 24480 + }, + { + "epoch": 0.7002144388849177, + "grad_norm": 0.5564437508583069, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0279, + "step": 24490 + }, + { + "epoch": 0.7005003573981415, + "grad_norm": 0.43762925267219543, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0317, + "step": 24500 + }, + { + "epoch": 0.7007862759113652, + "grad_norm": 0.46590355038642883, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0314, + "step": 24510 + }, + { + "epoch": 0.701072194424589, + "grad_norm": 0.640477180480957, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0258, + "step": 24520 + }, + { + "epoch": 0.7013581129378127, + "grad_norm": 0.5845742225646973, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0283, + "step": 24530 + }, + { + "epoch": 0.7016440314510365, + "grad_norm": 0.5625128746032715, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0381, + "step": 24540 + }, + { + "epoch": 0.7019299499642602, + "grad_norm": 0.4365232586860657, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0297, + "step": 24550 + }, + { + "epoch": 0.702215868477484, + "grad_norm": 0.5942055583000183, + "learning_rate": 3.518669865884119e-06, + "loss": 0.034, + "step": 24560 + }, + { + "epoch": 0.7025017869907076, + "grad_norm": 0.3847256302833557, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0293, + "step": 24570 + }, + { + "epoch": 0.7027877055039313, + "grad_norm": 0.542539119720459, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0327, + "step": 24580 + }, + { + "epoch": 0.7030736240171551, + "grad_norm": 0.5383610129356384, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0322, + "step": 24590 + }, + { + "epoch": 0.7033595425303788, + "grad_norm": 0.6085273027420044, + "learning_rate": 3.497061149826966e-06, + "loss": 0.0293, + "step": 24600 + }, + { + "epoch": 0.7036454610436026, + "grad_norm": 0.5107666254043579, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0266, + "step": 24610 + }, + { + "epoch": 0.7039313795568263, + "grad_norm": 0.4976873993873596, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0301, + "step": 24620 + }, + { + "epoch": 0.7042172980700501, + "grad_norm": 0.5735257863998413, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0264, + "step": 24630 + }, + { + "epoch": 0.7045032165832738, + "grad_norm": 0.6035013794898987, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0286, + "step": 24640 + }, + { + "epoch": 0.7047891350964975, + "grad_norm": 0.5665635466575623, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0345, + "step": 24650 + }, + { + "epoch": 0.7050750536097212, + "grad_norm": 0.5783578753471375, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0385, + "step": 24660 + }, + { + "epoch": 0.7053609721229449, + "grad_norm": 0.3957138657569885, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.0319, + "step": 24670 + }, + { + "epoch": 0.7056468906361687, + "grad_norm": 0.32982495427131653, + "learning_rate": 3.454266765790622e-06, + "loss": 0.034, + "step": 24680 + }, + { + "epoch": 0.7059328091493924, + "grad_norm": 0.5827629566192627, + "learning_rate": 3.448957251110008e-06, + "loss": 0.029, + "step": 24690 + }, + { + "epoch": 0.7062187276626162, + "grad_norm": 0.28891173005104065, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0328, + "step": 24700 + }, + { + "epoch": 0.7065046461758399, + "grad_norm": 0.7992371320724487, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0333, + "step": 24710 + }, + { + "epoch": 0.7067905646890636, + "grad_norm": 0.5976162552833557, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0327, + "step": 24720 + }, + { + "epoch": 0.7070764832022873, + "grad_norm": 0.4785068929195404, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0334, + "step": 24730 + }, + { + "epoch": 0.7073624017155111, + "grad_norm": 0.6561854481697083, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0317, + "step": 24740 + }, + { + "epoch": 0.7076483202287348, + "grad_norm": 0.6745696067810059, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0289, + "step": 24750 + }, + { + "epoch": 0.7079342387419585, + "grad_norm": 0.4914945960044861, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0306, + "step": 24760 + }, + { + "epoch": 0.7082201572551823, + "grad_norm": 0.35789182782173157, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0327, + "step": 24770 + }, + { + "epoch": 0.708506075768406, + "grad_norm": 0.416161447763443, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0313, + "step": 24780 + }, + { + "epoch": 0.7087919942816298, + "grad_norm": 0.6271718740463257, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0314, + "step": 24790 + }, + { + "epoch": 0.7090779127948534, + "grad_norm": 0.5230259895324707, + "learning_rate": 3.391138816571675e-06, + "loss": 0.037, + "step": 24800 + }, + { + "epoch": 0.7093638313080772, + "grad_norm": 0.54779452085495, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0364, + "step": 24810 + }, + { + "epoch": 0.7096497498213009, + "grad_norm": 0.6326698064804077, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0287, + "step": 24820 + }, + { + "epoch": 0.7099356683345247, + "grad_norm": 0.576437771320343, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0321, + "step": 24830 + }, + { + "epoch": 0.7102215868477484, + "grad_norm": 0.49094530940055847, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0328, + "step": 24840 + }, + { + "epoch": 0.7105075053609721, + "grad_norm": 3.1826400756835938, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0497, + "step": 24850 + }, + { + "epoch": 0.7107934238741959, + "grad_norm": 0.6048339009284973, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0429, + "step": 24860 + }, + { + "epoch": 0.7110793423874195, + "grad_norm": 0.6633393168449402, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0287, + "step": 24870 + }, + { + "epoch": 0.7113652609006433, + "grad_norm": 0.24930168688297272, + "learning_rate": 3.349767211300933e-06, + "loss": 0.027, + "step": 24880 + }, + { + "epoch": 0.711651179413867, + "grad_norm": 0.3934503495693207, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0256, + "step": 24890 + }, + { + "epoch": 0.7119370979270908, + "grad_norm": 0.7811068892478943, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.03, + "step": 24900 + }, + { + "epoch": 0.7122230164403145, + "grad_norm": 0.4274163246154785, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0263, + "step": 24910 + }, + { + "epoch": 0.7125089349535383, + "grad_norm": 0.5188158750534058, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0264, + "step": 24920 + }, + { + "epoch": 0.712794853466762, + "grad_norm": 0.4106016457080841, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0309, + "step": 24930 + }, + { + "epoch": 0.7130807719799857, + "grad_norm": 0.5283434987068176, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0335, + "step": 24940 + }, + { + "epoch": 0.7133666904932094, + "grad_norm": 0.38160789012908936, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0313, + "step": 24950 + }, + { + "epoch": 0.7136526090064331, + "grad_norm": 0.30552029609680176, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0265, + "step": 24960 + }, + { + "epoch": 0.7139385275196569, + "grad_norm": 0.40023618936538696, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0295, + "step": 24970 + }, + { + "epoch": 0.7142244460328806, + "grad_norm": 0.3569220006465912, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0259, + "step": 24980 + }, + { + "epoch": 0.7145103645461044, + "grad_norm": 0.39430442452430725, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0313, + "step": 24990 + }, + { + "epoch": 0.7147962830593281, + "grad_norm": 0.5891808271408081, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0272, + "step": 25000 + }, + { + "epoch": 0.7150822015725519, + "grad_norm": 0.487945556640625, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0308, + "step": 25010 + }, + { + "epoch": 0.7153681200857755, + "grad_norm": 0.551268458366394, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.026, + "step": 25020 + }, + { + "epoch": 0.7156540385989992, + "grad_norm": 0.7384896278381348, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0371, + "step": 25030 + }, + { + "epoch": 0.715939957112223, + "grad_norm": 0.43013718724250793, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0348, + "step": 25040 + }, + { + "epoch": 0.7162258756254467, + "grad_norm": 0.28747591376304626, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0249, + "step": 25050 + }, + { + "epoch": 0.7165117941386705, + "grad_norm": 0.48107975721359253, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0247, + "step": 25060 + }, + { + "epoch": 0.7167977126518942, + "grad_norm": 0.4077073931694031, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0313, + "step": 25070 + }, + { + "epoch": 0.717083631165118, + "grad_norm": 0.7853788137435913, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0286, + "step": 25080 + }, + { + "epoch": 0.7173695496783417, + "grad_norm": 0.6021899580955505, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0316, + "step": 25090 + }, + { + "epoch": 0.7176554681915654, + "grad_norm": 0.5997788906097412, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0286, + "step": 25100 + }, + { + "epoch": 0.7179413867047891, + "grad_norm": 0.47682714462280273, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0329, + "step": 25110 + }, + { + "epoch": 0.7182273052180128, + "grad_norm": 0.6501848697662354, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0289, + "step": 25120 + }, + { + "epoch": 0.7185132237312366, + "grad_norm": 1.000689148902893, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0351, + "step": 25130 + }, + { + "epoch": 0.7187991422444603, + "grad_norm": 0.5946705937385559, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0268, + "step": 25140 + }, + { + "epoch": 0.7190850607576841, + "grad_norm": 0.46967631578445435, + "learning_rate": 3.214397932123149e-06, + "loss": 0.031, + "step": 25150 + }, + { + "epoch": 0.7193709792709078, + "grad_norm": 1.052093744277954, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0302, + "step": 25160 + }, + { + "epoch": 0.7196568977841316, + "grad_norm": 0.9337649941444397, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0304, + "step": 25170 + }, + { + "epoch": 0.7199428162973552, + "grad_norm": 0.423648864030838, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0297, + "step": 25180 + }, + { + "epoch": 0.720228734810579, + "grad_norm": 0.46862924098968506, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.028, + "step": 25190 + }, + { + "epoch": 0.7205146533238027, + "grad_norm": 0.7099304795265198, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0306, + "step": 25200 + }, + { + "epoch": 0.7208005718370264, + "grad_norm": 0.5219885110855103, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0269, + "step": 25210 + }, + { + "epoch": 0.7210864903502502, + "grad_norm": 0.6347305774688721, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0307, + "step": 25220 + }, + { + "epoch": 0.7213724088634739, + "grad_norm": 0.7043943405151367, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0267, + "step": 25230 + }, + { + "epoch": 0.7216583273766977, + "grad_norm": 0.4137915074825287, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.028, + "step": 25240 + }, + { + "epoch": 0.7219442458899213, + "grad_norm": 0.4374844431877136, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0246, + "step": 25250 + }, + { + "epoch": 0.7222301644031451, + "grad_norm": 0.6796316504478455, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0285, + "step": 25260 + }, + { + "epoch": 0.7225160829163688, + "grad_norm": 0.4662792980670929, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0287, + "step": 25270 + }, + { + "epoch": 0.7228020014295926, + "grad_norm": 0.4035339653491974, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0289, + "step": 25280 + }, + { + "epoch": 0.7230879199428163, + "grad_norm": 0.40217533707618713, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0238, + "step": 25290 + }, + { + "epoch": 0.72337383845604, + "grad_norm": 0.3640667796134949, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0303, + "step": 25300 + }, + { + "epoch": 0.7236597569692638, + "grad_norm": 0.38176655769348145, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.0283, + "step": 25310 + }, + { + "epoch": 0.7239456754824874, + "grad_norm": 0.40747207403182983, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.031, + "step": 25320 + }, + { + "epoch": 0.7242315939957112, + "grad_norm": 0.3859431743621826, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0306, + "step": 25330 + }, + { + "epoch": 0.7245175125089349, + "grad_norm": 0.23738636076450348, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0313, + "step": 25340 + }, + { + "epoch": 0.7248034310221587, + "grad_norm": 0.3772980272769928, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0308, + "step": 25350 + }, + { + "epoch": 0.7250893495353824, + "grad_norm": 0.5451138019561768, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.03, + "step": 25360 + }, + { + "epoch": 0.7253752680486062, + "grad_norm": 0.6431843638420105, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0371, + "step": 25370 + }, + { + "epoch": 0.7256611865618299, + "grad_norm": 0.42552369832992554, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0311, + "step": 25380 + }, + { + "epoch": 0.7259471050750536, + "grad_norm": 0.5802433490753174, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0316, + "step": 25390 + }, + { + "epoch": 0.7262330235882773, + "grad_norm": 0.31489041447639465, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0282, + "step": 25400 + }, + { + "epoch": 0.726518942101501, + "grad_norm": 0.4227478504180908, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0274, + "step": 25410 + }, + { + "epoch": 0.7268048606147248, + "grad_norm": 0.5510851740837097, + "learning_rate": 3.085688933413021e-06, + "loss": 0.0297, + "step": 25420 + }, + { + "epoch": 0.7270907791279485, + "grad_norm": 0.3073323667049408, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0244, + "step": 25430 + }, + { + "epoch": 0.7273766976411723, + "grad_norm": 0.7394781112670898, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.028, + "step": 25440 + }, + { + "epoch": 0.727662616154396, + "grad_norm": 0.5067957639694214, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0374, + "step": 25450 + }, + { + "epoch": 0.7279485346676198, + "grad_norm": 0.4093882739543915, + "learning_rate": 3.067194157156521e-06, + "loss": 0.0347, + "step": 25460 + }, + { + "epoch": 0.7282344531808435, + "grad_norm": 0.37054866552352905, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0305, + "step": 25470 + }, + { + "epoch": 0.7285203716940671, + "grad_norm": 0.38795027136802673, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0282, + "step": 25480 + }, + { + "epoch": 0.7288062902072909, + "grad_norm": 0.49282407760620117, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0301, + "step": 25490 + }, + { + "epoch": 0.7290922087205146, + "grad_norm": 0.5234564542770386, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0352, + "step": 25500 + }, + { + "epoch": 0.7293781272337384, + "grad_norm": 0.5383297801017761, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0317, + "step": 25510 + }, + { + "epoch": 0.7296640457469621, + "grad_norm": 0.4277333617210388, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0278, + "step": 25520 + }, + { + "epoch": 0.7299499642601859, + "grad_norm": 0.6099430322647095, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0356, + "step": 25530 + }, + { + "epoch": 0.7302358827734096, + "grad_norm": 0.38870710134506226, + "learning_rate": 3.030651808761638e-06, + "loss": 0.027, + "step": 25540 + }, + { + "epoch": 0.7305218012866334, + "grad_norm": 0.48884090781211853, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0251, + "step": 25550 + }, + { + "epoch": 0.730807719799857, + "grad_norm": 0.5136672258377075, + "learning_rate": 3.021609639602321e-06, + "loss": 0.025, + "step": 25560 + }, + { + "epoch": 0.7310936383130807, + "grad_norm": 0.527056872844696, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.03, + "step": 25570 + }, + { + "epoch": 0.7313795568263045, + "grad_norm": 0.7081360220909119, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0303, + "step": 25580 + }, + { + "epoch": 0.7316654753395282, + "grad_norm": 0.48397257924079895, + "learning_rate": 3.008116622200155e-06, + "loss": 0.032, + "step": 25590 + }, + { + "epoch": 0.731951393852752, + "grad_norm": 0.38431495428085327, + "learning_rate": 3.003637700546652e-06, + "loss": 0.0337, + "step": 25600 + }, + { + "epoch": 0.7322373123659757, + "grad_norm": 0.48320460319519043, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0336, + "step": 25610 + }, + { + "epoch": 0.7325232308791995, + "grad_norm": 0.3164500892162323, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0244, + "step": 25620 + }, + { + "epoch": 0.7328091493924231, + "grad_norm": 0.5140587091445923, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0267, + "step": 25630 + }, + { + "epoch": 0.7330950679056469, + "grad_norm": 0.30739104747772217, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0373, + "step": 25640 + }, + { + "epoch": 0.7333809864188706, + "grad_norm": 0.3579956591129303, + "learning_rate": 2.981383959667165e-06, + "loss": 0.0328, + "step": 25650 + }, + { + "epoch": 0.7336669049320943, + "grad_norm": 0.7733256220817566, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0335, + "step": 25660 + }, + { + "epoch": 0.7339528234453181, + "grad_norm": 0.5355008244514465, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0291, + "step": 25670 + }, + { + "epoch": 0.7342387419585418, + "grad_norm": 0.5733621120452881, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0223, + "step": 25680 + }, + { + "epoch": 0.7345246604717656, + "grad_norm": 0.4484233260154724, + "learning_rate": 2.963750320724704e-06, + "loss": 0.03, + "step": 25690 + }, + { + "epoch": 0.7348105789849892, + "grad_norm": 0.46975597739219666, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0325, + "step": 25700 + }, + { + "epoch": 0.735096497498213, + "grad_norm": 0.4674699008464813, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0321, + "step": 25710 + }, + { + "epoch": 0.7353824160114367, + "grad_norm": 0.301565557718277, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0279, + "step": 25720 + }, + { + "epoch": 0.7356683345246605, + "grad_norm": 0.41966041922569275, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0319, + "step": 25730 + }, + { + "epoch": 0.7359542530378842, + "grad_norm": 0.5388277173042297, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0287, + "step": 25740 + }, + { + "epoch": 0.7362401715511079, + "grad_norm": 0.5821589231491089, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0298, + "step": 25750 + }, + { + "epoch": 0.7365260900643317, + "grad_norm": 0.9340733289718628, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0307, + "step": 25760 + }, + { + "epoch": 0.7368120085775554, + "grad_norm": 0.3654371201992035, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0284, + "step": 25770 + }, + { + "epoch": 0.7370979270907791, + "grad_norm": 0.38794293999671936, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0306, + "step": 25780 + }, + { + "epoch": 0.7373838456040028, + "grad_norm": 0.39955422282218933, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.0324, + "step": 25790 + }, + { + "epoch": 0.7376697641172266, + "grad_norm": 0.5864313244819641, + "learning_rate": 2.916036854664115e-06, + "loss": 0.031, + "step": 25800 + }, + { + "epoch": 0.7379556826304503, + "grad_norm": 0.4324203431606293, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0253, + "step": 25810 + }, + { + "epoch": 0.7382416011436741, + "grad_norm": 0.6346203684806824, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0278, + "step": 25820 + }, + { + "epoch": 0.7385275196568978, + "grad_norm": 0.3984649181365967, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0352, + "step": 25830 + }, + { + "epoch": 0.7388134381701215, + "grad_norm": 0.3954542577266693, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0305, + "step": 25840 + }, + { + "epoch": 0.7390993566833453, + "grad_norm": 0.3119542598724365, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0372, + "step": 25850 + }, + { + "epoch": 0.7393852751965689, + "grad_norm": 0.4094623029232025, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0251, + "step": 25860 + }, + { + "epoch": 0.7396711937097927, + "grad_norm": 0.5250104665756226, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0302, + "step": 25870 + }, + { + "epoch": 0.7399571122230164, + "grad_norm": 0.7610230445861816, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0257, + "step": 25880 + }, + { + "epoch": 0.7402430307362402, + "grad_norm": 0.5546014904975891, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0249, + "step": 25890 + }, + { + "epoch": 0.7405289492494639, + "grad_norm": 0.22835634648799896, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0287, + "step": 25900 + }, + { + "epoch": 0.7408148677626877, + "grad_norm": 0.7073826789855957, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0409, + "step": 25910 + }, + { + "epoch": 0.7411007862759114, + "grad_norm": 0.604634165763855, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0293, + "step": 25920 + }, + { + "epoch": 0.741386704789135, + "grad_norm": 0.46605581045150757, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0261, + "step": 25930 + }, + { + "epoch": 0.7416726233023588, + "grad_norm": 0.35719090700149536, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0304, + "step": 25940 + }, + { + "epoch": 0.7419585418155825, + "grad_norm": 0.3806651532649994, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0313, + "step": 25950 + }, + { + "epoch": 0.7422444603288063, + "grad_norm": 0.6443240642547607, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0303, + "step": 25960 + }, + { + "epoch": 0.74253037884203, + "grad_norm": 0.42187514901161194, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0282, + "step": 25970 + }, + { + "epoch": 0.7428162973552538, + "grad_norm": 0.4213440418243408, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0312, + "step": 25980 + }, + { + "epoch": 0.7431022158684775, + "grad_norm": 0.3982003331184387, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0279, + "step": 25990 + }, + { + "epoch": 0.7433881343817013, + "grad_norm": 0.3418596386909485, + "learning_rate": 2.832230653119002e-06, + "loss": 0.0318, + "step": 26000 + }, + { + "epoch": 0.7436740528949249, + "grad_norm": 0.3633996844291687, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0301, + "step": 26010 + }, + { + "epoch": 0.7439599714081486, + "grad_norm": 0.362079918384552, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.028, + "step": 26020 + }, + { + "epoch": 0.7442458899213724, + "grad_norm": 0.4734862744808197, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.031, + "step": 26030 + }, + { + "epoch": 0.7445318084345961, + "grad_norm": 0.31540775299072266, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0287, + "step": 26040 + }, + { + "epoch": 0.7448177269478199, + "grad_norm": 0.6774418950080872, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.029, + "step": 26050 + }, + { + "epoch": 0.7451036454610436, + "grad_norm": 0.3063428997993469, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0308, + "step": 26060 + }, + { + "epoch": 0.7453895639742674, + "grad_norm": 0.691943347454071, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.0265, + "step": 26070 + }, + { + "epoch": 0.745675482487491, + "grad_norm": 0.5507379174232483, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0286, + "step": 26080 + }, + { + "epoch": 0.7459614010007148, + "grad_norm": 0.34355828166007996, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.024, + "step": 26090 + }, + { + "epoch": 0.7462473195139385, + "grad_norm": 0.5120819807052612, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0288, + "step": 26100 + }, + { + "epoch": 0.7465332380271622, + "grad_norm": 0.5197821259498596, + "learning_rate": 2.78776903555923e-06, + "loss": 0.028, + "step": 26110 + }, + { + "epoch": 0.746819156540386, + "grad_norm": 0.46328091621398926, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.0247, + "step": 26120 + }, + { + "epoch": 0.7471050750536097, + "grad_norm": 0.6205909848213196, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0306, + "step": 26130 + }, + { + "epoch": 0.7473909935668335, + "grad_norm": 0.4201740622520447, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.025, + "step": 26140 + }, + { + "epoch": 0.7476769120800572, + "grad_norm": 0.23724111914634705, + "learning_rate": 2.771889969647e-06, + "loss": 0.0283, + "step": 26150 + }, + { + "epoch": 0.747962830593281, + "grad_norm": 0.8046770691871643, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0318, + "step": 26160 + }, + { + "epoch": 0.7482487491065046, + "grad_norm": 0.5273832082748413, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0342, + "step": 26170 + }, + { + "epoch": 0.7485346676197284, + "grad_norm": 0.923651397228241, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0255, + "step": 26180 + }, + { + "epoch": 0.7488205861329521, + "grad_norm": 0.6395840644836426, + "learning_rate": 2.756165401834933e-06, + "loss": 0.0277, + "step": 26190 + }, + { + "epoch": 0.7491065046461758, + "grad_norm": 0.44334620237350464, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.0285, + "step": 26200 + }, + { + "epoch": 0.7493924231593996, + "grad_norm": 0.47904232144355774, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0282, + "step": 26210 + }, + { + "epoch": 0.7496783416726233, + "grad_norm": 0.9316203594207764, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0307, + "step": 26220 + }, + { + "epoch": 0.749964260185847, + "grad_norm": 0.5045170783996582, + "learning_rate": 2.740595627381096e-06, + "loss": 0.0242, + "step": 26230 + }, + { + "epoch": 0.7502501786990707, + "grad_norm": 0.54493248462677, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0263, + "step": 26240 + }, + { + "epoch": 0.7505360972122945, + "grad_norm": 0.6128116846084595, + "learning_rate": 2.732868879137055e-06, + "loss": 0.0305, + "step": 26250 + }, + { + "epoch": 0.7508220157255182, + "grad_norm": 0.6235067844390869, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.037, + "step": 26260 + }, + { + "epoch": 0.751107934238742, + "grad_norm": 0.43458008766174316, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0274, + "step": 26270 + }, + { + "epoch": 0.7513938527519657, + "grad_norm": 0.5540400147438049, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0289, + "step": 26280 + }, + { + "epoch": 0.7516797712651894, + "grad_norm": 0.4317619204521179, + "learning_rate": 2.717531841969889e-06, + "loss": 0.0313, + "step": 26290 + }, + { + "epoch": 0.7519656897784132, + "grad_norm": 0.42271071672439575, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0291, + "step": 26300 + }, + { + "epoch": 0.7522516082916368, + "grad_norm": 0.6096150875091553, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0374, + "step": 26310 + }, + { + "epoch": 0.7525375268048606, + "grad_norm": 0.5820568799972534, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.027, + "step": 26320 + }, + { + "epoch": 0.7528234453180843, + "grad_norm": 0.4441884756088257, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0258, + "step": 26330 + }, + { + "epoch": 0.7531093638313081, + "grad_norm": 0.48442211747169495, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.0257, + "step": 26340 + }, + { + "epoch": 0.7533952823445318, + "grad_norm": 0.7179747223854065, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0281, + "step": 26350 + }, + { + "epoch": 0.7536812008577556, + "grad_norm": 0.5399336218833923, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.03, + "step": 26360 + }, + { + "epoch": 0.7539671193709793, + "grad_norm": 0.5521562099456787, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.0267, + "step": 26370 + }, + { + "epoch": 0.754253037884203, + "grad_norm": 0.3727903366088867, + "learning_rate": 2.683592557860616e-06, + "loss": 0.0274, + "step": 26380 + }, + { + "epoch": 0.7545389563974267, + "grad_norm": 0.5607078671455383, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.033, + "step": 26390 + }, + { + "epoch": 0.7548248749106504, + "grad_norm": 0.3736121654510498, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.0267, + "step": 26400 + }, + { + "epoch": 0.7551107934238742, + "grad_norm": 0.47778844833374023, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0246, + "step": 26410 + }, + { + "epoch": 0.7553967119370979, + "grad_norm": 0.5479125380516052, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.0273, + "step": 26420 + }, + { + "epoch": 0.7556826304503217, + "grad_norm": 0.5152542591094971, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0288, + "step": 26430 + }, + { + "epoch": 0.7559685489635454, + "grad_norm": 0.38652661442756653, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0319, + "step": 26440 + }, + { + "epoch": 0.7562544674767692, + "grad_norm": 0.8551011085510254, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.0312, + "step": 26450 + }, + { + "epoch": 0.7565403859899928, + "grad_norm": 0.5332438349723816, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0348, + "step": 26460 + }, + { + "epoch": 0.7568263045032165, + "grad_norm": 0.5529776215553284, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.0305, + "step": 26470 + }, + { + "epoch": 0.7571122230164403, + "grad_norm": 0.47610723972320557, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.027, + "step": 26480 + }, + { + "epoch": 0.757398141529664, + "grad_norm": 0.5565681457519531, + "learning_rate": 2.643185095075473e-06, + "loss": 0.0277, + "step": 26490 + }, + { + "epoch": 0.7576840600428878, + "grad_norm": 0.40319734811782837, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0287, + "step": 26500 + }, + { + "epoch": 0.7579699785561115, + "grad_norm": 0.5117385387420654, + "learning_rate": 2.635965610168249e-06, + "loss": 0.0312, + "step": 26510 + }, + { + "epoch": 0.7582558970693353, + "grad_norm": 0.47812822461128235, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0287, + "step": 26520 + }, + { + "epoch": 0.758541815582559, + "grad_norm": 0.24216991662979126, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0321, + "step": 26530 + }, + { + "epoch": 0.7588277340957827, + "grad_norm": 0.24864375591278076, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0312, + "step": 26540 + }, + { + "epoch": 0.7591136526090064, + "grad_norm": 0.39162659645080566, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0333, + "step": 26550 + }, + { + "epoch": 0.7593995711222301, + "grad_norm": 0.30692365765571594, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.0261, + "step": 26560 + }, + { + "epoch": 0.7596854896354539, + "grad_norm": 0.5904929041862488, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0309, + "step": 26570 + }, + { + "epoch": 0.7599714081486776, + "grad_norm": 0.5509836673736572, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0223, + "step": 26580 + }, + { + "epoch": 0.7602573266619014, + "grad_norm": 0.45913293957710266, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0338, + "step": 26590 + }, + { + "epoch": 0.7605432451751251, + "grad_norm": 0.3952873647212982, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.0283, + "step": 26600 + }, + { + "epoch": 0.7608291636883489, + "grad_norm": 0.49259039759635925, + "learning_rate": 2.600457796416397e-06, + "loss": 0.0262, + "step": 26610 + }, + { + "epoch": 0.7611150822015725, + "grad_norm": 0.49096909165382385, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.0265, + "step": 26620 + }, + { + "epoch": 0.7614010007147963, + "grad_norm": 0.48913729190826416, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0347, + "step": 26630 + }, + { + "epoch": 0.76168691922802, + "grad_norm": 0.391233891248703, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0281, + "step": 26640 + }, + { + "epoch": 0.7619728377412437, + "grad_norm": 0.3726404011249542, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0252, + "step": 26650 + }, + { + "epoch": 0.7622587562544675, + "grad_norm": 0.441919207572937, + "learning_rate": 2.583073279935805e-06, + "loss": 0.025, + "step": 26660 + }, + { + "epoch": 0.7625446747676912, + "grad_norm": 0.6720325350761414, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.0264, + "step": 26670 + }, + { + "epoch": 0.762830593280915, + "grad_norm": 0.4706156849861145, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0307, + "step": 26680 + }, + { + "epoch": 0.7631165117941386, + "grad_norm": 0.6154748797416687, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0283, + "step": 26690 + }, + { + "epoch": 0.7634024303073624, + "grad_norm": 0.4765104651451111, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.0292, + "step": 26700 + }, + { + "epoch": 0.7636883488205861, + "grad_norm": 0.33775731921195984, + "learning_rate": 2.565935706183804e-06, + "loss": 0.0281, + "step": 26710 + }, + { + "epoch": 0.7639742673338099, + "grad_norm": 0.9325317144393921, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0282, + "step": 26720 + }, + { + "epoch": 0.7642601858470336, + "grad_norm": 0.5118368864059448, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0264, + "step": 26730 + }, + { + "epoch": 0.7645461043602573, + "grad_norm": 0.6633817553520203, + "learning_rate": 2.555771903907403e-06, + "loss": 0.035, + "step": 26740 + }, + { + "epoch": 0.7648320228734811, + "grad_norm": 0.8666901588439941, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0299, + "step": 26750 + }, + { + "epoch": 0.7651179413867047, + "grad_norm": 0.47465914487838745, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0281, + "step": 26760 + }, + { + "epoch": 0.7654038598999285, + "grad_norm": 0.5317928791046143, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0237, + "step": 26770 + }, + { + "epoch": 0.7656897784131522, + "grad_norm": 0.6626484394073486, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.0297, + "step": 26780 + }, + { + "epoch": 0.765975696926376, + "grad_norm": 0.5603852272033691, + "learning_rate": 2.5390304813179e-06, + "loss": 0.0279, + "step": 26790 + }, + { + "epoch": 0.7662616154395997, + "grad_norm": 0.392030268907547, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.0276, + "step": 26800 + }, + { + "epoch": 0.7665475339528235, + "grad_norm": 0.5270085334777832, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0283, + "step": 26810 + }, + { + "epoch": 0.7668334524660472, + "grad_norm": 0.5256703495979309, + "learning_rate": 2.529104749380281e-06, + "loss": 0.029, + "step": 26820 + }, + { + "epoch": 0.7671193709792709, + "grad_norm": 0.3960905075073242, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0297, + "step": 26830 + }, + { + "epoch": 0.7674052894924946, + "grad_norm": 0.4214257597923279, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0279, + "step": 26840 + }, + { + "epoch": 0.7676912080057183, + "grad_norm": 0.4516659677028656, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0268, + "step": 26850 + }, + { + "epoch": 0.7679771265189421, + "grad_norm": 0.4527135193347931, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0285, + "step": 26860 + }, + { + "epoch": 0.7682630450321658, + "grad_norm": 0.4458029270172119, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0282, + "step": 26870 + }, + { + "epoch": 0.7685489635453896, + "grad_norm": 0.5262351036071777, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.0289, + "step": 26880 + }, + { + "epoch": 0.7688348820586133, + "grad_norm": 0.7576776146888733, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0304, + "step": 26890 + }, + { + "epoch": 0.7691208005718371, + "grad_norm": 0.3779038190841675, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0308, + "step": 26900 + }, + { + "epoch": 0.7694067190850608, + "grad_norm": 0.5801526308059692, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0279, + "step": 26910 + }, + { + "epoch": 0.7696926375982844, + "grad_norm": 0.6423588991165161, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0291, + "step": 26920 + }, + { + "epoch": 0.7699785561115082, + "grad_norm": 0.3891446590423584, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0276, + "step": 26930 + }, + { + "epoch": 0.7702644746247319, + "grad_norm": 0.6453003883361816, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0297, + "step": 26940 + }, + { + "epoch": 0.7705503931379557, + "grad_norm": 0.5512704253196716, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0273, + "step": 26950 + }, + { + "epoch": 0.7708363116511794, + "grad_norm": 0.5719016790390015, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0284, + "step": 26960 + }, + { + "epoch": 0.7711222301644032, + "grad_norm": 0.325624942779541, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0249, + "step": 26970 + }, + { + "epoch": 0.7714081486776269, + "grad_norm": 0.5242589712142944, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0268, + "step": 26980 + }, + { + "epoch": 0.7716940671908507, + "grad_norm": 0.3835712969303131, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.0293, + "step": 26990 + }, + { + "epoch": 0.7719799857040743, + "grad_norm": 0.5894249081611633, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0303, + "step": 27000 + }, + { + "epoch": 0.772265904217298, + "grad_norm": 0.4519590437412262, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.0252, + "step": 27010 + }, + { + "epoch": 0.7725518227305218, + "grad_norm": 0.590528130531311, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0297, + "step": 27020 + }, + { + "epoch": 0.7728377412437455, + "grad_norm": 0.5418447852134705, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0306, + "step": 27030 + }, + { + "epoch": 0.7731236597569693, + "grad_norm": 1.027212142944336, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.0302, + "step": 27040 + }, + { + "epoch": 0.773409578270193, + "grad_norm": 0.5057966709136963, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.0295, + "step": 27050 + }, + { + "epoch": 0.7736954967834168, + "grad_norm": 0.9749689698219299, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.0288, + "step": 27060 + }, + { + "epoch": 0.7739814152966404, + "grad_norm": 0.7263986468315125, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.03, + "step": 27070 + }, + { + "epoch": 0.7742673338098642, + "grad_norm": 0.6080947518348694, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0312, + "step": 27080 + }, + { + "epoch": 0.7745532523230879, + "grad_norm": 0.5187621712684631, + "learning_rate": 2.443811559007335e-06, + "loss": 0.0235, + "step": 27090 + }, + { + "epoch": 0.7748391708363116, + "grad_norm": 0.6019864678382874, + "learning_rate": 2.440792688039862e-06, + "loss": 0.0356, + "step": 27100 + }, + { + "epoch": 0.7751250893495354, + "grad_norm": 0.4716169238090515, + "learning_rate": 2.437783861778914e-06, + "loss": 0.0241, + "step": 27110 + }, + { + "epoch": 0.7754110078627591, + "grad_norm": 0.2648717761039734, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.027, + "step": 27120 + }, + { + "epoch": 0.7756969263759829, + "grad_norm": 0.43119028210639954, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0278, + "step": 27130 + }, + { + "epoch": 0.7759828448892065, + "grad_norm": 0.37466534972190857, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0254, + "step": 27140 + }, + { + "epoch": 0.7762687634024303, + "grad_norm": 0.36353442072868347, + "learning_rate": 2.425849074243997e-06, + "loss": 0.0263, + "step": 27150 + }, + { + "epoch": 0.776554681915654, + "grad_norm": 0.35461705923080444, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0281, + "step": 27160 + }, + { + "epoch": 0.7768406004288778, + "grad_norm": 0.5017783045768738, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0312, + "step": 27170 + }, + { + "epoch": 0.7771265189421015, + "grad_norm": 0.461370050907135, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0301, + "step": 27180 + }, + { + "epoch": 0.7774124374553252, + "grad_norm": 0.3844483494758606, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0276, + "step": 27190 + }, + { + "epoch": 0.777698355968549, + "grad_norm": 0.32640641927719116, + "learning_rate": 2.411157015949963e-06, + "loss": 0.0262, + "step": 27200 + }, + { + "epoch": 0.7779842744817727, + "grad_norm": 0.6539550423622131, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0303, + "step": 27210 + }, + { + "epoch": 0.7782701929949964, + "grad_norm": 0.5505805015563965, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0267, + "step": 27220 + }, + { + "epoch": 0.7785561115082201, + "grad_norm": 0.433768630027771, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0299, + "step": 27230 + }, + { + "epoch": 0.7788420300214439, + "grad_norm": 0.7262346148490906, + "learning_rate": 2.399584779188417e-06, + "loss": 0.0278, + "step": 27240 + }, + { + "epoch": 0.7791279485346676, + "grad_norm": 0.6827511787414551, + "learning_rate": 2.396716944205467e-06, + "loss": 0.0319, + "step": 27250 + }, + { + "epoch": 0.7794138670478914, + "grad_norm": 0.3138200342655182, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.0261, + "step": 27260 + }, + { + "epoch": 0.7796997855611151, + "grad_norm": 0.36588770151138306, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0251, + "step": 27270 + }, + { + "epoch": 0.7799857040743388, + "grad_norm": 1.105770468711853, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0272, + "step": 27280 + }, + { + "epoch": 0.7802716225875626, + "grad_norm": 0.4482360780239105, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.0247, + "step": 27290 + }, + { + "epoch": 0.7805575411007862, + "grad_norm": 0.5545430779457092, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0307, + "step": 27300 + }, + { + "epoch": 0.78084345961401, + "grad_norm": 0.45449620485305786, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0264, + "step": 27310 + }, + { + "epoch": 0.7811293781272337, + "grad_norm": 0.37734025716781616, + "learning_rate": 2.376924986395271e-06, + "loss": 0.0275, + "step": 27320 + }, + { + "epoch": 0.7814152966404575, + "grad_norm": 0.47029784321784973, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0325, + "step": 27330 + }, + { + "epoch": 0.7817012151536812, + "grad_norm": 0.3540012240409851, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0258, + "step": 27340 + }, + { + "epoch": 0.781987133666905, + "grad_norm": 0.8363472819328308, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0288, + "step": 27350 + }, + { + "epoch": 0.7822730521801287, + "grad_norm": 0.5943127274513245, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.0289, + "step": 27360 + }, + { + "epoch": 0.7825589706933523, + "grad_norm": 0.48346707224845886, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0304, + "step": 27370 + }, + { + "epoch": 0.7828448892065761, + "grad_norm": 0.5776712894439697, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.0262, + "step": 27380 + }, + { + "epoch": 0.7831308077197998, + "grad_norm": 0.37524285912513733, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0342, + "step": 27390 + }, + { + "epoch": 0.7834167262330236, + "grad_norm": 0.4272121787071228, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.0262, + "step": 27400 + }, + { + "epoch": 0.7837026447462473, + "grad_norm": 0.3545357286930084, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.0273, + "step": 27410 + }, + { + "epoch": 0.7839885632594711, + "grad_norm": 0.4780922830104828, + "learning_rate": 2.349511203900333e-06, + "loss": 0.0255, + "step": 27420 + }, + { + "epoch": 0.7842744817726948, + "grad_norm": 0.6846514940261841, + "learning_rate": 2.3468256081258e-06, + "loss": 0.035, + "step": 27430 + }, + { + "epoch": 0.7845604002859186, + "grad_norm": 0.6890650391578674, + "learning_rate": 2.344150167333397e-06, + "loss": 0.0305, + "step": 27440 + }, + { + "epoch": 0.7848463187991422, + "grad_norm": 0.41689804196357727, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0275, + "step": 27450 + }, + { + "epoch": 0.7851322373123659, + "grad_norm": 0.5169947743415833, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0261, + "step": 27460 + }, + { + "epoch": 0.7854181558255897, + "grad_norm": 0.3667839467525482, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0267, + "step": 27470 + }, + { + "epoch": 0.7857040743388134, + "grad_norm": 0.4650583267211914, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0259, + "step": 27480 + }, + { + "epoch": 0.7859899928520372, + "grad_norm": 0.5303590297698975, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0318, + "step": 27490 + }, + { + "epoch": 0.7862759113652609, + "grad_norm": 0.38010939955711365, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.0292, + "step": 27500 + }, + { + "epoch": 0.7865618298784847, + "grad_norm": 0.5952475070953369, + "learning_rate": 2.325706683525094e-06, + "loss": 0.0265, + "step": 27510 + }, + { + "epoch": 0.7868477483917083, + "grad_norm": 0.34000876545906067, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0255, + "step": 27520 + }, + { + "epoch": 0.7871336669049321, + "grad_norm": 0.333310604095459, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0266, + "step": 27530 + }, + { + "epoch": 0.7874195854181558, + "grad_norm": 1.0167195796966553, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0303, + "step": 27540 + }, + { + "epoch": 0.7877055039313795, + "grad_norm": 0.506395697593689, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0255, + "step": 27550 + }, + { + "epoch": 0.7879914224446033, + "grad_norm": 0.4995521008968353, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.0232, + "step": 27560 + }, + { + "epoch": 0.788277340957827, + "grad_norm": 0.592944324016571, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0271, + "step": 27570 + }, + { + "epoch": 0.7885632594710508, + "grad_norm": 0.5690013766288757, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0349, + "step": 27580 + }, + { + "epoch": 0.7888491779842745, + "grad_norm": 0.5303569436073303, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0301, + "step": 27590 + }, + { + "epoch": 0.7891350964974982, + "grad_norm": 0.4314960539340973, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0266, + "step": 27600 + }, + { + "epoch": 0.7894210150107219, + "grad_norm": 0.4138862192630768, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0237, + "step": 27610 + }, + { + "epoch": 0.7897069335239457, + "grad_norm": 0.5151752829551697, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.0268, + "step": 27620 + }, + { + "epoch": 0.7899928520371694, + "grad_norm": 0.7513082027435303, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.031, + "step": 27630 + }, + { + "epoch": 0.7902787705503931, + "grad_norm": 0.2644256055355072, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0259, + "step": 27640 + }, + { + "epoch": 0.7905646890636169, + "grad_norm": 0.5767413377761841, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0312, + "step": 27650 + }, + { + "epoch": 0.7908506075768406, + "grad_norm": 0.4754960536956787, + "learning_rate": 2.287865908463585e-06, + "loss": 0.035, + "step": 27660 + }, + { + "epoch": 0.7911365260900644, + "grad_norm": 0.4080045521259308, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.0271, + "step": 27670 + }, + { + "epoch": 0.791422444603288, + "grad_norm": 0.3843805193901062, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0312, + "step": 27680 + }, + { + "epoch": 0.7917083631165118, + "grad_norm": 0.3925490975379944, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0242, + "step": 27690 + }, + { + "epoch": 0.7919942816297355, + "grad_norm": 0.3966064155101776, + "learning_rate": 2.278163146933236e-06, + "loss": 0.0257, + "step": 27700 + }, + { + "epoch": 0.7922802001429593, + "grad_norm": 0.6077889204025269, + "learning_rate": 2.275763038367336e-06, + "loss": 0.0238, + "step": 27710 + }, + { + "epoch": 0.792566118656183, + "grad_norm": 0.6053628921508789, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0356, + "step": 27720 + }, + { + "epoch": 0.7928520371694067, + "grad_norm": 0.49703511595726013, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0283, + "step": 27730 + }, + { + "epoch": 0.7931379556826305, + "grad_norm": 0.5619977712631226, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0272, + "step": 27740 + }, + { + "epoch": 0.7934238741958541, + "grad_norm": 0.6108564734458923, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0271, + "step": 27750 + }, + { + "epoch": 0.7937097927090779, + "grad_norm": 0.4029979109764099, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0265, + "step": 27760 + }, + { + "epoch": 0.7939957112223016, + "grad_norm": 0.45793306827545166, + "learning_rate": 2.261577490652103e-06, + "loss": 0.0229, + "step": 27770 + }, + { + "epoch": 0.7942816297355254, + "grad_norm": 0.433551162481308, + "learning_rate": 2.259249109209093e-06, + "loss": 0.0264, + "step": 27780 + }, + { + "epoch": 0.7945675482487491, + "grad_norm": 0.4247429072856903, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0269, + "step": 27790 + }, + { + "epoch": 0.7948534667619729, + "grad_norm": 0.4973151981830597, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.0281, + "step": 27800 + }, + { + "epoch": 0.7951393852751966, + "grad_norm": 0.5111087560653687, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.0267, + "step": 27810 + }, + { + "epoch": 0.7954253037884202, + "grad_norm": 0.5530220866203308, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0291, + "step": 27820 + }, + { + "epoch": 0.795711222301644, + "grad_norm": 0.4368492662906647, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0302, + "step": 27830 + }, + { + "epoch": 0.7959971408148677, + "grad_norm": 0.5381907820701599, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0253, + "step": 27840 + }, + { + "epoch": 0.7962830593280915, + "grad_norm": 0.3638664186000824, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.0258, + "step": 27850 + }, + { + "epoch": 0.7965689778413152, + "grad_norm": 0.38014277815818787, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.0279, + "step": 27860 + }, + { + "epoch": 0.796854896354539, + "grad_norm": 0.46882548928260803, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.0272, + "step": 27870 + }, + { + "epoch": 0.7971408148677627, + "grad_norm": 0.4826337397098541, + "learning_rate": 2.236529916369313e-06, + "loss": 0.027, + "step": 27880 + }, + { + "epoch": 0.7974267333809865, + "grad_norm": 0.7986114621162415, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0263, + "step": 27890 + }, + { + "epoch": 0.7977126518942101, + "grad_norm": 0.5447944402694702, + "learning_rate": 2.232109406453595e-06, + "loss": 0.0321, + "step": 27900 + }, + { + "epoch": 0.7979985704074338, + "grad_norm": 0.21586239337921143, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0241, + "step": 27910 + }, + { + "epoch": 0.7982844889206576, + "grad_norm": 0.8066816926002502, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0285, + "step": 27920 + }, + { + "epoch": 0.7985704074338813, + "grad_norm": 0.5516615509986877, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0302, + "step": 27930 + }, + { + "epoch": 0.7988563259471051, + "grad_norm": 0.6859652996063232, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0244, + "step": 27940 + }, + { + "epoch": 0.7991422444603288, + "grad_norm": 0.5234702229499817, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0272, + "step": 27950 + }, + { + "epoch": 0.7994281629735526, + "grad_norm": 0.32633450627326965, + "learning_rate": 2.219094909262834e-06, + "loss": 0.0249, + "step": 27960 + }, + { + "epoch": 0.7997140814867763, + "grad_norm": 0.5086314678192139, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0278, + "step": 27970 + }, + { + "epoch": 0.8, + "grad_norm": 0.40988171100616455, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.0297, + "step": 27980 + }, + { + "epoch": 0.8002859185132237, + "grad_norm": 0.4648076891899109, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0271, + "step": 27990 + }, + { + "epoch": 0.8005718370264474, + "grad_norm": 0.7577387690544128, + "learning_rate": 2.210624641425579e-06, + "loss": 0.0328, + "step": 28000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.748988931866624e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/training_args.bin b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a8e9db2fc8c02e02c3d9dc8ab6720ad303a5b3a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612ba70c7690571cb25b3741b149289d0da6675f330268700d4dd75e92ecc19a +size 6097 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/generation_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48788e9db87ec0a1f3c57369f97599281bb6ff59 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60e236c468d9fe78565b222177a5c1455250ac5e838df3cef20e7b974cdf5175 +size 4921072616 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..281677a827cfc618245396084c1f660afffe3179 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ce617a8c790ee1ea829e1ced51561b76027197dfcb3de42b9d9fb8d16fda107 +size 4978830984 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..114ee897cfc500507adcc5388cadcd6956a4a822 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccfabc16525ea4059e9cdabb977d5a0d0c668b97d1724617738818c4691368ae +size 4100977896 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..7a37358d95e92a337ffbc69008e6d3a514583ff2 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -15.553912042236327, + -29.199742523193358, + -19.58108451538086, + -2.290254103851318, + -3.98537020587921, + -3.326780859374999, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 20.256868560791013, + 29.94644501495361, + 21.81786548461914, + 2.931905368041992, + 5.064435471534729, + 3.8213318216323877, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.8829866647720337, + 2.0021812915802, + 0.2094610631465912, + 0.0940750315785408, + 0.0910087525844574, + 0.012966467998921871, + -0.09716881066560745, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.976093769073486, + 10.930583953857422, + 8.330232620239258, + 0.8605863451957703, + 1.5304595232009888, + 1.1747541427612305, + 0.995267927646637, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -14.624815139007566, + -31.510755078125, + -35.281760287475585, + -4.413841687011719, + -8.509904860687255, + -6.548201916885375, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 40.4127169593811, + 31.91034956970215, + 26.84413584289551, + 7.540738459014893, + 10.178268561553956, + 9.913993389892582, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 10.31286334991455, + 3.0421667098999023, + -4.947638511657715, + 0.41632387042045593, + -0.9987452030181885, + -0.18793217837810516, + -0.08814626932144165, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 10.463665962219238, + 14.231209754943848, + 11.03242301940918, + 2.1795010566711426, + 3.3540749549865723, + 2.708117961883545, + 0.9961075186729431, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aa02143bc160986b180a7297557f57b9878ad0e1 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json @@ -0,0 +1,21034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8577555396711937, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002859185132237312, + "grad_norm": 4.32843542098999, + "learning_rate": 1.8e-07, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0005718370264474624, + "grad_norm": 5.184113502502441, + "learning_rate": 3.8e-07, + "loss": 0.6206, + "step": 20 + }, + { + "epoch": 0.0008577555396711937, + "grad_norm": 4.515527248382568, + "learning_rate": 5.800000000000001e-07, + "loss": 0.582, + "step": 30 + }, + { + "epoch": 0.0011436740528949249, + "grad_norm": 2.8382818698883057, + "learning_rate": 7.8e-07, + "loss": 0.544, + "step": 40 + }, + { + "epoch": 0.0014295925661186562, + "grad_norm": 4.019079208374023, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6381, + "step": 50 + }, + { + "epoch": 0.0017155110793423873, + "grad_norm": 2.9916157722473145, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5463, + "step": 60 + }, + { + "epoch": 0.0020014295925661185, + "grad_norm": 3.3288328647613525, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.446, + "step": 70 + }, + { + "epoch": 0.0022873481057898498, + "grad_norm": 3.181410312652588, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4497, + "step": 80 + }, + { + "epoch": 0.002573266619013581, + "grad_norm": 1.421942949295044, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.349, + "step": 90 + }, + { + "epoch": 0.0028591851322373124, + "grad_norm": 1.908596396446228, + "learning_rate": 1.98e-06, + "loss": 0.3338, + "step": 100 + }, + { + "epoch": 0.0031451036454610438, + "grad_norm": 1.8309729099273682, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.0034310221586847747, + "grad_norm": 3.051408290863037, + "learning_rate": 2.38e-06, + "loss": 0.2418, + "step": 120 + }, + { + "epoch": 0.003716940671908506, + "grad_norm": 2.4083356857299805, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1726, + "step": 130 + }, + { + "epoch": 0.004002859185132237, + "grad_norm": 1.111687421798706, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.2164, + "step": 140 + }, + { + "epoch": 0.004288777698355968, + "grad_norm": 1.3874679803848267, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1312, + "step": 150 + }, + { + "epoch": 0.0045746962115796996, + "grad_norm": 1.2791540622711182, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1198, + "step": 160 + }, + { + "epoch": 0.004860614724803431, + "grad_norm": 1.6237181425094604, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1027, + "step": 170 + }, + { + "epoch": 0.005146533238027162, + "grad_norm": 0.9669432640075684, + "learning_rate": 3.58e-06, + "loss": 0.0968, + "step": 180 + }, + { + "epoch": 0.0054324517512508936, + "grad_norm": 1.4933182001113892, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.1012, + "step": 190 + }, + { + "epoch": 0.005718370264474625, + "grad_norm": 1.8615745306015015, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0901, + "step": 200 + }, + { + "epoch": 0.006004288777698356, + "grad_norm": 1.867163062095642, + "learning_rate": 4.18e-06, + "loss": 0.1067, + "step": 210 + }, + { + "epoch": 0.0062902072909220876, + "grad_norm": 1.199497103691101, + "learning_rate": 4.38e-06, + "loss": 0.0841, + "step": 220 + }, + { + "epoch": 0.006576125804145818, + "grad_norm": 1.1568272113800049, + "learning_rate": 4.58e-06, + "loss": 0.0951, + "step": 230 + }, + { + "epoch": 0.006862044317369549, + "grad_norm": 2.139226198196411, + "learning_rate": 4.78e-06, + "loss": 0.0845, + "step": 240 + }, + { + "epoch": 0.007147962830593281, + "grad_norm": 1.0357667207717896, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0828, + "step": 250 + }, + { + "epoch": 0.007433881343817012, + "grad_norm": 1.0145683288574219, + "learning_rate": 5.18e-06, + "loss": 0.0925, + "step": 260 + }, + { + "epoch": 0.007719799857040743, + "grad_norm": 1.308053731918335, + "learning_rate": 5.380000000000001e-06, + "loss": 0.082, + "step": 270 + }, + { + "epoch": 0.008005718370264474, + "grad_norm": 1.1561739444732666, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0888, + "step": 280 + }, + { + "epoch": 0.008291636883488206, + "grad_norm": 0.8777005672454834, + "learning_rate": 5.78e-06, + "loss": 0.0693, + "step": 290 + }, + { + "epoch": 0.008577555396711936, + "grad_norm": 0.9127368330955505, + "learning_rate": 5.98e-06, + "loss": 0.0823, + "step": 300 + }, + { + "epoch": 0.008863473909935669, + "grad_norm": 0.5608117580413818, + "learning_rate": 6.18e-06, + "loss": 0.0733, + "step": 310 + }, + { + "epoch": 0.009149392423159399, + "grad_norm": 1.9068444967269897, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0772, + "step": 320 + }, + { + "epoch": 0.009435310936383131, + "grad_norm": 0.9090886116027832, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.062, + "step": 330 + }, + { + "epoch": 0.009721229449606862, + "grad_norm": 1.191778540611267, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0718, + "step": 340 + }, + { + "epoch": 0.010007147962830594, + "grad_norm": 1.3743036985397339, + "learning_rate": 6.98e-06, + "loss": 0.0822, + "step": 350 + }, + { + "epoch": 0.010293066476054324, + "grad_norm": 1.4244364500045776, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0793, + "step": 360 + }, + { + "epoch": 0.010578984989278055, + "grad_norm": 1.1766910552978516, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0637, + "step": 370 + }, + { + "epoch": 0.010864903502501787, + "grad_norm": 1.1331329345703125, + "learning_rate": 7.58e-06, + "loss": 0.0705, + "step": 380 + }, + { + "epoch": 0.011150822015725518, + "grad_norm": 0.4898548424243927, + "learning_rate": 7.78e-06, + "loss": 0.0686, + "step": 390 + }, + { + "epoch": 0.01143674052894925, + "grad_norm": 0.7398406267166138, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0719, + "step": 400 + }, + { + "epoch": 0.01172265904217298, + "grad_norm": 1.1516162157058716, + "learning_rate": 8.18e-06, + "loss": 0.0696, + "step": 410 + }, + { + "epoch": 0.012008577555396712, + "grad_norm": 1.6034163236618042, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0698, + "step": 420 + }, + { + "epoch": 0.012294496068620443, + "grad_norm": 1.2195311784744263, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.012580414581844175, + "grad_norm": 1.1106441020965576, + "learning_rate": 8.78e-06, + "loss": 0.0749, + "step": 440 + }, + { + "epoch": 0.012866333095067906, + "grad_norm": 1.1787506341934204, + "learning_rate": 8.98e-06, + "loss": 0.0718, + "step": 450 + }, + { + "epoch": 0.013152251608291636, + "grad_norm": 0.4380492568016052, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0692, + "step": 460 + }, + { + "epoch": 0.013438170121515368, + "grad_norm": 1.0138392448425293, + "learning_rate": 9.38e-06, + "loss": 0.0718, + "step": 470 + }, + { + "epoch": 0.013724088634739099, + "grad_norm": 0.50003582239151, + "learning_rate": 9.58e-06, + "loss": 0.078, + "step": 480 + }, + { + "epoch": 0.014010007147962831, + "grad_norm": 0.6253323554992676, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0762, + "step": 490 + }, + { + "epoch": 0.014295925661186561, + "grad_norm": 0.6725791096687317, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0615, + "step": 500 + }, + { + "epoch": 0.014581844174410294, + "grad_norm": 0.6100206971168518, + "learning_rate": 1.018e-05, + "loss": 0.0576, + "step": 510 + }, + { + "epoch": 0.014867762687634024, + "grad_norm": 1.9225071668624878, + "learning_rate": 1.038e-05, + "loss": 0.0957, + "step": 520 + }, + { + "epoch": 0.015153681200857756, + "grad_norm": 1.304625391960144, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.015439599714081487, + "grad_norm": 0.7657200694084167, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0612, + "step": 540 + }, + { + "epoch": 0.015725518227305217, + "grad_norm": 0.7371220588684082, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0719, + "step": 550 + }, + { + "epoch": 0.016011436740528948, + "grad_norm": 0.7274985313415527, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0594, + "step": 560 + }, + { + "epoch": 0.01629735525375268, + "grad_norm": 1.3222947120666504, + "learning_rate": 1.138e-05, + "loss": 0.0655, + "step": 570 + }, + { + "epoch": 0.016583273766976412, + "grad_norm": 0.965411901473999, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0638, + "step": 580 + }, + { + "epoch": 0.016869192280200142, + "grad_norm": 0.8161532878875732, + "learning_rate": 1.178e-05, + "loss": 0.0532, + "step": 590 + }, + { + "epoch": 0.017155110793423873, + "grad_norm": 0.8228808045387268, + "learning_rate": 1.198e-05, + "loss": 0.051, + "step": 600 + }, + { + "epoch": 0.017441029306647607, + "grad_norm": 0.6932743191719055, + "learning_rate": 1.218e-05, + "loss": 0.0595, + "step": 610 + }, + { + "epoch": 0.017726947819871337, + "grad_norm": 0.6848511099815369, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0589, + "step": 620 + }, + { + "epoch": 0.018012866333095068, + "grad_norm": 1.137454867362976, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0694, + "step": 630 + }, + { + "epoch": 0.018298784846318798, + "grad_norm": 0.8087878227233887, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0738, + "step": 640 + }, + { + "epoch": 0.01858470335954253, + "grad_norm": 0.8093737363815308, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.058, + "step": 650 + }, + { + "epoch": 0.018870621872766263, + "grad_norm": 0.8387401700019836, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0686, + "step": 660 + }, + { + "epoch": 0.019156540385989993, + "grad_norm": 1.1544110774993896, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0592, + "step": 670 + }, + { + "epoch": 0.019442458899213724, + "grad_norm": 0.8208314180374146, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.019728377412437454, + "grad_norm": 0.97088623046875, + "learning_rate": 1.378e-05, + "loss": 0.0675, + "step": 690 + }, + { + "epoch": 0.020014295925661188, + "grad_norm": 1.0991814136505127, + "learning_rate": 1.398e-05, + "loss": 0.0745, + "step": 700 + }, + { + "epoch": 0.02030021443888492, + "grad_norm": 0.9467299580574036, + "learning_rate": 1.418e-05, + "loss": 0.0645, + "step": 710 + }, + { + "epoch": 0.02058613295210865, + "grad_norm": 0.4910801351070404, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0466, + "step": 720 + }, + { + "epoch": 0.02087205146533238, + "grad_norm": 1.0102845430374146, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0735, + "step": 730 + }, + { + "epoch": 0.02115796997855611, + "grad_norm": 0.9033467769622803, + "learning_rate": 1.478e-05, + "loss": 0.0741, + "step": 740 + }, + { + "epoch": 0.021443888491779844, + "grad_norm": 1.6092171669006348, + "learning_rate": 1.498e-05, + "loss": 0.0737, + "step": 750 + }, + { + "epoch": 0.021729807005003574, + "grad_norm": 0.7047333717346191, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0604, + "step": 760 + }, + { + "epoch": 0.022015725518227305, + "grad_norm": 1.2015491724014282, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0646, + "step": 770 + }, + { + "epoch": 0.022301644031451035, + "grad_norm": 1.1669623851776123, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0587, + "step": 780 + }, + { + "epoch": 0.02258756254467477, + "grad_norm": 1.137113094329834, + "learning_rate": 1.578e-05, + "loss": 0.0692, + "step": 790 + }, + { + "epoch": 0.0228734810578985, + "grad_norm": 1.269505262374878, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0711, + "step": 800 + }, + { + "epoch": 0.02315939957112223, + "grad_norm": 0.942534863948822, + "learning_rate": 1.618e-05, + "loss": 0.0782, + "step": 810 + }, + { + "epoch": 0.02344531808434596, + "grad_norm": 0.9548556208610535, + "learning_rate": 1.638e-05, + "loss": 0.0814, + "step": 820 + }, + { + "epoch": 0.02373123659756969, + "grad_norm": 1.0210421085357666, + "learning_rate": 1.658e-05, + "loss": 0.0774, + "step": 830 + }, + { + "epoch": 0.024017155110793425, + "grad_norm": 1.0955135822296143, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0693, + "step": 840 + }, + { + "epoch": 0.024303073624017155, + "grad_norm": 1.2081682682037354, + "learning_rate": 1.698e-05, + "loss": 0.0589, + "step": 850 + }, + { + "epoch": 0.024588992137240886, + "grad_norm": 0.9728164076805115, + "learning_rate": 1.718e-05, + "loss": 0.0585, + "step": 860 + }, + { + "epoch": 0.024874910650464616, + "grad_norm": 1.310244083404541, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.066, + "step": 870 + }, + { + "epoch": 0.02516082916368835, + "grad_norm": 0.8860681653022766, + "learning_rate": 1.758e-05, + "loss": 0.0703, + "step": 880 + }, + { + "epoch": 0.02544674767691208, + "grad_norm": 2.1878466606140137, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0913, + "step": 890 + }, + { + "epoch": 0.02573266619013581, + "grad_norm": 0.6659205555915833, + "learning_rate": 1.798e-05, + "loss": 0.0603, + "step": 900 + }, + { + "epoch": 0.02601858470335954, + "grad_norm": 0.6700656414031982, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.074, + "step": 910 + }, + { + "epoch": 0.026304503216583272, + "grad_norm": 0.8292778134346008, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0529, + "step": 920 + }, + { + "epoch": 0.026590421729807006, + "grad_norm": 0.9637550115585327, + "learning_rate": 1.858e-05, + "loss": 0.0604, + "step": 930 + }, + { + "epoch": 0.026876340243030736, + "grad_norm": 0.4605652689933777, + "learning_rate": 1.878e-05, + "loss": 0.0657, + "step": 940 + }, + { + "epoch": 0.027162258756254467, + "grad_norm": 1.3346972465515137, + "learning_rate": 1.898e-05, + "loss": 0.0576, + "step": 950 + }, + { + "epoch": 0.027448177269478197, + "grad_norm": 0.8369432091712952, + "learning_rate": 1.918e-05, + "loss": 0.0567, + "step": 960 + }, + { + "epoch": 0.02773409578270193, + "grad_norm": 0.613459050655365, + "learning_rate": 1.938e-05, + "loss": 0.0523, + "step": 970 + }, + { + "epoch": 0.028020014295925662, + "grad_norm": 1.402799367904663, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0794, + "step": 980 + }, + { + "epoch": 0.028305932809149392, + "grad_norm": 1.1603201627731323, + "learning_rate": 1.978e-05, + "loss": 0.0583, + "step": 990 + }, + { + "epoch": 0.028591851322373123, + "grad_norm": 0.8101517558097839, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0584, + "step": 1000 + }, + { + "epoch": 0.028877769835596853, + "grad_norm": 1.060592770576477, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.077, + "step": 1010 + }, + { + "epoch": 0.029163688348820587, + "grad_norm": 1.2096195220947266, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.066, + "step": 1020 + }, + { + "epoch": 0.029449606862044318, + "grad_norm": 1.0035862922668457, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0625, + "step": 1030 + }, + { + "epoch": 0.029735525375268048, + "grad_norm": 0.44185084104537964, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.03002144388849178, + "grad_norm": 1.209908127784729, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0693, + "step": 1050 + }, + { + "epoch": 0.030307362401715512, + "grad_norm": 0.9716938138008118, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0706, + "step": 1060 + }, + { + "epoch": 0.030593280914939243, + "grad_norm": 0.8310994505882263, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0622, + "step": 1070 + }, + { + "epoch": 0.030879199428162973, + "grad_norm": 0.8737888932228088, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0564, + "step": 1080 + }, + { + "epoch": 0.031165117941386704, + "grad_norm": 0.7609763145446777, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0552, + "step": 1090 + }, + { + "epoch": 0.031451036454610434, + "grad_norm": 0.6319764256477356, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0593, + "step": 1100 + }, + { + "epoch": 0.031736954967834165, + "grad_norm": 0.5562251806259155, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0553, + "step": 1110 + }, + { + "epoch": 0.032022873481057895, + "grad_norm": 1.3476046323776245, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0805, + "step": 1120 + }, + { + "epoch": 0.03230879199428163, + "grad_norm": 0.5449394583702087, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0666, + "step": 1130 + }, + { + "epoch": 0.03259471050750536, + "grad_norm": 0.8675817251205444, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0703, + "step": 1140 + }, + { + "epoch": 0.032880629020729094, + "grad_norm": 0.8713150024414062, + "learning_rate": 1.999882759038658e-05, + "loss": 0.063, + "step": 1150 + }, + { + "epoch": 0.033166547533952824, + "grad_norm": 0.7205761075019836, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0597, + "step": 1160 + }, + { + "epoch": 0.033452466047176554, + "grad_norm": 0.482741117477417, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0558, + "step": 1170 + }, + { + "epoch": 0.033738384560400285, + "grad_norm": 0.8652167320251465, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0559, + "step": 1180 + }, + { + "epoch": 0.034024303073624015, + "grad_norm": 0.5286755561828613, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0673, + "step": 1190 + }, + { + "epoch": 0.034310221586847746, + "grad_norm": 0.9883217215538025, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0609, + "step": 1200 + }, + { + "epoch": 0.034596140100071476, + "grad_norm": 0.7700253129005432, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0703, + "step": 1210 + }, + { + "epoch": 0.034882058613295214, + "grad_norm": 0.8669867515563965, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0577, + "step": 1220 + }, + { + "epoch": 0.035167977126518944, + "grad_norm": 0.8856104016304016, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0599, + "step": 1230 + }, + { + "epoch": 0.035453895639742675, + "grad_norm": 0.5517004728317261, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0791, + "step": 1240 + }, + { + "epoch": 0.035739814152966405, + "grad_norm": 0.7505853176116943, + "learning_rate": 1.999672592499692e-05, + "loss": 0.086, + "step": 1250 + }, + { + "epoch": 0.036025732666190136, + "grad_norm": 0.7412230968475342, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0534, + "step": 1260 + }, + { + "epoch": 0.036311651179413866, + "grad_norm": 0.6629419922828674, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0607, + "step": 1270 + }, + { + "epoch": 0.036597569692637597, + "grad_norm": 0.7081887125968933, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0513, + "step": 1280 + }, + { + "epoch": 0.03688348820586133, + "grad_norm": 0.8555129766464233, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0574, + "step": 1290 + }, + { + "epoch": 0.03716940671908506, + "grad_norm": 0.5992563366889954, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0529, + "step": 1300 + }, + { + "epoch": 0.037455325232308795, + "grad_norm": 0.8527185320854187, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0588, + "step": 1310 + }, + { + "epoch": 0.037741243745532525, + "grad_norm": 1.078600525856018, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0644, + "step": 1320 + }, + { + "epoch": 0.038027162258756256, + "grad_norm": 0.8158502578735352, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0587, + "step": 1330 + }, + { + "epoch": 0.038313080771979986, + "grad_norm": 1.011278748512268, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0696, + "step": 1340 + }, + { + "epoch": 0.03859899928520372, + "grad_norm": 0.806888222694397, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0588, + "step": 1350 + }, + { + "epoch": 0.03888491779842745, + "grad_norm": 0.7776031494140625, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0461, + "step": 1360 + }, + { + "epoch": 0.03917083631165118, + "grad_norm": 0.6119349598884583, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0566, + "step": 1370 + }, + { + "epoch": 0.03945675482487491, + "grad_norm": 0.6168059706687927, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0636, + "step": 1380 + }, + { + "epoch": 0.03974267333809864, + "grad_norm": 0.8180692195892334, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0596, + "step": 1390 + }, + { + "epoch": 0.040028591851322376, + "grad_norm": 0.6775726079940796, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0586, + "step": 1400 + }, + { + "epoch": 0.040314510364546106, + "grad_norm": 0.7446377873420715, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.057, + "step": 1410 + }, + { + "epoch": 0.04060042887776984, + "grad_norm": 0.9334514737129211, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0551, + "step": 1420 + }, + { + "epoch": 0.04088634739099357, + "grad_norm": 1.481874942779541, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0664, + "step": 1430 + }, + { + "epoch": 0.0411722659042173, + "grad_norm": 0.9553850889205933, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0645, + "step": 1440 + }, + { + "epoch": 0.04145818441744103, + "grad_norm": 0.8824119567871094, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0694, + "step": 1450 + }, + { + "epoch": 0.04174410293066476, + "grad_norm": 1.0382661819458008, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0683, + "step": 1460 + }, + { + "epoch": 0.04203002144388849, + "grad_norm": 0.5914127826690674, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0652, + "step": 1470 + }, + { + "epoch": 0.04231593995711222, + "grad_norm": 0.8497964143753052, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0706, + "step": 1480 + }, + { + "epoch": 0.04260185847033596, + "grad_norm": 0.897759199142456, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0647, + "step": 1490 + }, + { + "epoch": 0.04288777698355969, + "grad_norm": 1.1102443933486938, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0579, + "step": 1500 + }, + { + "epoch": 0.04317369549678342, + "grad_norm": 0.7638678550720215, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0606, + "step": 1510 + }, + { + "epoch": 0.04345961401000715, + "grad_norm": 0.6662708520889282, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.067, + "step": 1520 + }, + { + "epoch": 0.04374553252323088, + "grad_norm": 0.4957924485206604, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0685, + "step": 1530 + }, + { + "epoch": 0.04403145103645461, + "grad_norm": 0.6456794738769531, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0665, + "step": 1540 + }, + { + "epoch": 0.04431736954967834, + "grad_norm": 1.1598498821258545, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0527, + "step": 1550 + }, + { + "epoch": 0.04460328806290207, + "grad_norm": 0.931520938873291, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0682, + "step": 1560 + }, + { + "epoch": 0.0448892065761258, + "grad_norm": 0.7289925813674927, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0726, + "step": 1570 + }, + { + "epoch": 0.04517512508934954, + "grad_norm": 0.5471235513687134, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0561, + "step": 1580 + }, + { + "epoch": 0.04546104360257327, + "grad_norm": 0.8686550259590149, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0552, + "step": 1590 + }, + { + "epoch": 0.045746962115797, + "grad_norm": 1.1767120361328125, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0544, + "step": 1600 + }, + { + "epoch": 0.04603288062902073, + "grad_norm": 0.8729729056358337, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0625, + "step": 1610 + }, + { + "epoch": 0.04631879914224446, + "grad_norm": 1.3734601736068726, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0667, + "step": 1620 + }, + { + "epoch": 0.04660471765546819, + "grad_norm": 0.6810682415962219, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0522, + "step": 1630 + }, + { + "epoch": 0.04689063616869192, + "grad_norm": 0.7744873762130737, + "learning_rate": 1.997844517262844e-05, + "loss": 0.06, + "step": 1640 + }, + { + "epoch": 0.04717655468191565, + "grad_norm": 1.000954270362854, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0606, + "step": 1650 + }, + { + "epoch": 0.04746247319513938, + "grad_norm": 0.8105701208114624, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.04774839170836312, + "grad_norm": 0.9504240155220032, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0833, + "step": 1670 + }, + { + "epoch": 0.04803431022158685, + "grad_norm": 0.910836935043335, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0582, + "step": 1680 + }, + { + "epoch": 0.04832022873481058, + "grad_norm": 0.5865645408630371, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0657, + "step": 1690 + }, + { + "epoch": 0.04860614724803431, + "grad_norm": 1.0098698139190674, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 0.04889206576125804, + "grad_norm": 0.8097764253616333, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0563, + "step": 1710 + }, + { + "epoch": 0.04917798427448177, + "grad_norm": 0.9958128333091736, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0597, + "step": 1720 + }, + { + "epoch": 0.0494639027877055, + "grad_norm": 0.8471905589103699, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.04974982130092923, + "grad_norm": 0.647058367729187, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 0.05003573981415296, + "grad_norm": 1.0832161903381348, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.052, + "step": 1750 + }, + { + "epoch": 0.0503216583273767, + "grad_norm": 0.8469381332397461, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0615, + "step": 1760 + }, + { + "epoch": 0.05060757684060043, + "grad_norm": 0.5371052622795105, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0556, + "step": 1770 + }, + { + "epoch": 0.05089349535382416, + "grad_norm": 0.9016183614730835, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0561, + "step": 1780 + }, + { + "epoch": 0.05117941386704789, + "grad_norm": 0.8829526305198669, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0648, + "step": 1790 + }, + { + "epoch": 0.05146533238027162, + "grad_norm": 1.079738974571228, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0577, + "step": 1800 + }, + { + "epoch": 0.05175125089349535, + "grad_norm": 0.7496556639671326, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.052, + "step": 1810 + }, + { + "epoch": 0.05203716940671908, + "grad_norm": 0.7587016820907593, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0634, + "step": 1820 + }, + { + "epoch": 0.052323087919942814, + "grad_norm": 0.9622246623039246, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0543, + "step": 1830 + }, + { + "epoch": 0.052609006433166544, + "grad_norm": 0.6643623113632202, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0606, + "step": 1840 + }, + { + "epoch": 0.05289492494639028, + "grad_norm": 0.8060843348503113, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0562, + "step": 1850 + }, + { + "epoch": 0.05318084345961401, + "grad_norm": 0.7353034019470215, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0679, + "step": 1860 + }, + { + "epoch": 0.05346676197283774, + "grad_norm": 0.6636782288551331, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0561, + "step": 1870 + }, + { + "epoch": 0.05375268048606147, + "grad_norm": 0.6760010719299316, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0674, + "step": 1880 + }, + { + "epoch": 0.0540385989992852, + "grad_norm": 0.7144591808319092, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0551, + "step": 1890 + }, + { + "epoch": 0.054324517512508934, + "grad_norm": 0.8346575498580933, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.049, + "step": 1900 + }, + { + "epoch": 0.054610436025732664, + "grad_norm": 1.1682871580123901, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0554, + "step": 1910 + }, + { + "epoch": 0.054896354538956395, + "grad_norm": 0.9150840640068054, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0549, + "step": 1920 + }, + { + "epoch": 0.055182273052180125, + "grad_norm": 0.37064746022224426, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0547, + "step": 1930 + }, + { + "epoch": 0.05546819156540386, + "grad_norm": 1.1214783191680908, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0697, + "step": 1940 + }, + { + "epoch": 0.05575411007862759, + "grad_norm": 0.8259853720664978, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0614, + "step": 1950 + }, + { + "epoch": 0.056040028591851324, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0811, + "step": 1960 + }, + { + "epoch": 0.056325947105075054, + "grad_norm": 0.8764797449111938, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0605, + "step": 1970 + }, + { + "epoch": 0.056611865618298784, + "grad_norm": 0.770044207572937, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0481, + "step": 1980 + }, + { + "epoch": 0.056897784131522515, + "grad_norm": 1.333876132965088, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0729, + "step": 1990 + }, + { + "epoch": 0.057183702644746245, + "grad_norm": 0.5231258273124695, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.051, + "step": 2000 + }, + { + "epoch": 0.057469621157969976, + "grad_norm": 1.1937541961669922, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.061, + "step": 2010 + }, + { + "epoch": 0.057755539671193706, + "grad_norm": 0.7843487858772278, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0688, + "step": 2020 + }, + { + "epoch": 0.058041458184417444, + "grad_norm": 0.7956593632698059, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0878, + "step": 2030 + }, + { + "epoch": 0.058327376697641174, + "grad_norm": 0.5006444454193115, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0588, + "step": 2040 + }, + { + "epoch": 0.058613295210864905, + "grad_norm": 1.162245750427246, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0619, + "step": 2050 + }, + { + "epoch": 0.058899213724088635, + "grad_norm": 0.46943384408950806, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0584, + "step": 2060 + }, + { + "epoch": 0.059185132237312366, + "grad_norm": 0.3780323266983032, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0462, + "step": 2070 + }, + { + "epoch": 0.059471050750536096, + "grad_norm": 0.7066171765327454, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0652, + "step": 2080 + }, + { + "epoch": 0.05975696926375983, + "grad_norm": 0.8464685082435608, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0546, + "step": 2090 + }, + { + "epoch": 0.06004288777698356, + "grad_norm": 0.7198944687843323, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0534, + "step": 2100 + }, + { + "epoch": 0.06032880629020729, + "grad_norm": 0.7136557698249817, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0665, + "step": 2110 + }, + { + "epoch": 0.060614724803431025, + "grad_norm": 0.8739225268363953, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0542, + "step": 2120 + }, + { + "epoch": 0.060900643316654755, + "grad_norm": 0.6694063544273376, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0575, + "step": 2130 + }, + { + "epoch": 0.061186561829878486, + "grad_norm": 0.4805296063423157, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0578, + "step": 2140 + }, + { + "epoch": 0.061472480343102216, + "grad_norm": 0.758660078048706, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0455, + "step": 2150 + }, + { + "epoch": 0.06175839885632595, + "grad_norm": 0.8114968538284302, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0801, + "step": 2160 + }, + { + "epoch": 0.06204431736954968, + "grad_norm": 0.6585670113563538, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0564, + "step": 2170 + }, + { + "epoch": 0.06233023588277341, + "grad_norm": 1.2986794710159302, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0595, + "step": 2180 + }, + { + "epoch": 0.06261615439599715, + "grad_norm": 0.9822471141815186, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0515, + "step": 2190 + }, + { + "epoch": 0.06290207290922087, + "grad_norm": 0.8112025260925293, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0585, + "step": 2200 + }, + { + "epoch": 0.0631879914224446, + "grad_norm": 0.6239551305770874, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0641, + "step": 2210 + }, + { + "epoch": 0.06347390993566833, + "grad_norm": 0.8405657410621643, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.057, + "step": 2220 + }, + { + "epoch": 0.06375982844889207, + "grad_norm": 0.4925670623779297, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0568, + "step": 2230 + }, + { + "epoch": 0.06404574696211579, + "grad_norm": 0.8599978089332581, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0587, + "step": 2240 + }, + { + "epoch": 0.06433166547533953, + "grad_norm": 0.8657258749008179, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.06461758398856327, + "grad_norm": 0.5826218128204346, + "learning_rate": 1.991642153373178e-05, + "loss": 0.055, + "step": 2260 + }, + { + "epoch": 0.06490350250178699, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0533, + "step": 2270 + }, + { + "epoch": 0.06518942101501073, + "grad_norm": 0.8345134258270264, + "learning_rate": 1.991374933341515e-05, + "loss": 0.064, + "step": 2280 + }, + { + "epoch": 0.06547533952823445, + "grad_norm": 0.6610177755355835, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0553, + "step": 2290 + }, + { + "epoch": 0.06576125804145819, + "grad_norm": 0.8541404604911804, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0483, + "step": 2300 + }, + { + "epoch": 0.06604717655468191, + "grad_norm": 0.9029123187065125, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0517, + "step": 2310 + }, + { + "epoch": 0.06633309506790565, + "grad_norm": 0.614111602306366, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.06661901358112937, + "grad_norm": 0.8723806142807007, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0511, + "step": 2330 + }, + { + "epoch": 0.06690493209435311, + "grad_norm": 0.5288586020469666, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0474, + "step": 2340 + }, + { + "epoch": 0.06719085060757685, + "grad_norm": 0.6346511840820312, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0532, + "step": 2350 + }, + { + "epoch": 0.06747676912080057, + "grad_norm": 0.9112687706947327, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0583, + "step": 2360 + }, + { + "epoch": 0.06776268763402431, + "grad_norm": 0.6879385113716125, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0551, + "step": 2370 + }, + { + "epoch": 0.06804860614724803, + "grad_norm": 0.6945562958717346, + "learning_rate": 1.989976094288735e-05, + "loss": 0.053, + "step": 2380 + }, + { + "epoch": 0.06833452466047177, + "grad_norm": 0.6774301528930664, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0596, + "step": 2390 + }, + { + "epoch": 0.06862044317369549, + "grad_norm": 0.7311446070671082, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0576, + "step": 2400 + }, + { + "epoch": 0.06890636168691923, + "grad_norm": 0.9301936030387878, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0608, + "step": 2410 + }, + { + "epoch": 0.06919228020014295, + "grad_norm": 1.1750341653823853, + "learning_rate": 1.989387305123247e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.06947819871336669, + "grad_norm": 0.716266930103302, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.053, + "step": 2430 + }, + { + "epoch": 0.06976411722659043, + "grad_norm": 0.8549973964691162, + "learning_rate": 1.989086647373215e-05, + "loss": 0.061, + "step": 2440 + }, + { + "epoch": 0.07005003573981415, + "grad_norm": 0.7306638360023499, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "epoch": 0.07033595425303789, + "grad_norm": 1.2529624700546265, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0597, + "step": 2460 + }, + { + "epoch": 0.07062187276626161, + "grad_norm": 0.7199717164039612, + "learning_rate": 1.988627835751598e-05, + "loss": 0.047, + "step": 2470 + }, + { + "epoch": 0.07090779127948535, + "grad_norm": 0.8007253408432007, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0544, + "step": 2480 + }, + { + "epoch": 0.07119370979270907, + "grad_norm": 0.7852535843849182, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0507, + "step": 2490 + }, + { + "epoch": 0.07147962830593281, + "grad_norm": 1.0649739503860474, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.058, + "step": 2500 + }, + { + "epoch": 0.07176554681915653, + "grad_norm": 0.8080071806907654, + "learning_rate": 1.988001487826387e-05, + "loss": 0.059, + "step": 2510 + }, + { + "epoch": 0.07205146533238027, + "grad_norm": 0.49453601241111755, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0522, + "step": 2520 + }, + { + "epoch": 0.07233738384560401, + "grad_norm": 0.7618975639343262, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.07262330235882773, + "grad_norm": 0.6284596920013428, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.058, + "step": 2540 + }, + { + "epoch": 0.07290922087205147, + "grad_norm": 1.6536812782287598, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0535, + "step": 2550 + }, + { + "epoch": 0.07319513938527519, + "grad_norm": 0.6516987681388855, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.061, + "step": 2560 + }, + { + "epoch": 0.07348105789849893, + "grad_norm": 0.7660441398620605, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0603, + "step": 2570 + }, + { + "epoch": 0.07376697641172265, + "grad_norm": 0.7900884747505188, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0494, + "step": 2580 + }, + { + "epoch": 0.07405289492494639, + "grad_norm": 0.9578459858894348, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0492, + "step": 2590 + }, + { + "epoch": 0.07433881343817011, + "grad_norm": 0.5268751978874207, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0524, + "step": 2600 + }, + { + "epoch": 0.07462473195139385, + "grad_norm": 0.8935990929603577, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0537, + "step": 2610 + }, + { + "epoch": 0.07491065046461759, + "grad_norm": 0.940441370010376, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0545, + "step": 2620 + }, + { + "epoch": 0.07519656897784131, + "grad_norm": 0.42767468094825745, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0508, + "step": 2630 + }, + { + "epoch": 0.07548248749106505, + "grad_norm": 0.6892207860946655, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0417, + "step": 2640 + }, + { + "epoch": 0.07576840600428877, + "grad_norm": 1.2622859477996826, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0665, + "step": 2650 + }, + { + "epoch": 0.07605432451751251, + "grad_norm": 0.8809115290641785, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0617, + "step": 2660 + }, + { + "epoch": 0.07634024303073624, + "grad_norm": 0.604371190071106, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0577, + "step": 2670 + }, + { + "epoch": 0.07662616154395997, + "grad_norm": 0.7091525793075562, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0552, + "step": 2680 + }, + { + "epoch": 0.0769120800571837, + "grad_norm": 0.7841326594352722, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0579, + "step": 2690 + }, + { + "epoch": 0.07719799857040743, + "grad_norm": 0.7789046764373779, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0511, + "step": 2700 + }, + { + "epoch": 0.07748391708363117, + "grad_norm": 0.6497660875320435, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0532, + "step": 2710 + }, + { + "epoch": 0.0777698355968549, + "grad_norm": 0.6902356147766113, + "learning_rate": 1.984439891859038e-05, + "loss": 0.06, + "step": 2720 + }, + { + "epoch": 0.07805575411007863, + "grad_norm": 0.5721703767776489, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0442, + "step": 2730 + }, + { + "epoch": 0.07834167262330236, + "grad_norm": 0.5205336809158325, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0551, + "step": 2740 + }, + { + "epoch": 0.07862759113652609, + "grad_norm": 1.0646073818206787, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0546, + "step": 2750 + }, + { + "epoch": 0.07891350964974982, + "grad_norm": 0.6809906363487244, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0603, + "step": 2760 + }, + { + "epoch": 0.07919942816297355, + "grad_norm": 0.7592756152153015, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0611, + "step": 2770 + }, + { + "epoch": 0.07948534667619728, + "grad_norm": 0.970733106136322, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.066, + "step": 2780 + }, + { + "epoch": 0.07977126518942101, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.983150881656814e-05, + "loss": 0.049, + "step": 2790 + }, + { + "epoch": 0.08005718370264475, + "grad_norm": 0.6761397123336792, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.048, + "step": 2800 + }, + { + "epoch": 0.08034310221586848, + "grad_norm": 0.9752228856086731, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.08062902072909221, + "grad_norm": 0.8727124929428101, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0629, + "step": 2820 + }, + { + "epoch": 0.08091493924231594, + "grad_norm": 0.8425240516662598, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0517, + "step": 2830 + }, + { + "epoch": 0.08120085775553967, + "grad_norm": 0.7011470198631287, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0484, + "step": 2840 + }, + { + "epoch": 0.0814867762687634, + "grad_norm": 0.836200475692749, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0504, + "step": 2850 + }, + { + "epoch": 0.08177269478198713, + "grad_norm": 0.4431964159011841, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0527, + "step": 2860 + }, + { + "epoch": 0.08205861329521086, + "grad_norm": 0.4666791260242462, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0556, + "step": 2870 + }, + { + "epoch": 0.0823445318084346, + "grad_norm": 0.5705346465110779, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0544, + "step": 2880 + }, + { + "epoch": 0.08263045032165833, + "grad_norm": 1.7237486839294434, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0555, + "step": 2890 + }, + { + "epoch": 0.08291636883488206, + "grad_norm": 0.9305147528648376, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.055, + "step": 2900 + }, + { + "epoch": 0.0832022873481058, + "grad_norm": 1.3475992679595947, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0723, + "step": 2910 + }, + { + "epoch": 0.08348820586132952, + "grad_norm": 0.7196787595748901, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0581, + "step": 2920 + }, + { + "epoch": 0.08377412437455325, + "grad_norm": 0.4567016363143921, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.08406004288777698, + "grad_norm": 0.8537796139717102, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0589, + "step": 2940 + }, + { + "epoch": 0.08434596140100072, + "grad_norm": 0.9526864886283875, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0581, + "step": 2950 + }, + { + "epoch": 0.08463187991422444, + "grad_norm": 0.8753517866134644, + "learning_rate": 1.979809151602651e-05, + "loss": 0.066, + "step": 2960 + }, + { + "epoch": 0.08491779842744818, + "grad_norm": 0.9062561988830566, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0472, + "step": 2970 + }, + { + "epoch": 0.08520371694067191, + "grad_norm": 1.0018329620361328, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0584, + "step": 2980 + }, + { + "epoch": 0.08548963545389564, + "grad_norm": 1.0577157735824585, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.05, + "step": 2990 + }, + { + "epoch": 0.08577555396711938, + "grad_norm": 1.0216799974441528, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0703, + "step": 3000 + }, + { + "epoch": 0.0860614724803431, + "grad_norm": 0.5581191778182983, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0682, + "step": 3010 + }, + { + "epoch": 0.08634739099356684, + "grad_norm": 0.6187682151794434, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 0.08663330950679056, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0565, + "step": 3030 + }, + { + "epoch": 0.0869192280200143, + "grad_norm": 0.8952509760856628, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0615, + "step": 3040 + }, + { + "epoch": 0.08720514653323802, + "grad_norm": 0.7387855648994446, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0434, + "step": 3050 + }, + { + "epoch": 0.08749106504646176, + "grad_norm": 0.8661363124847412, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0453, + "step": 3060 + }, + { + "epoch": 0.0877769835596855, + "grad_norm": 1.552089810371399, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0577, + "step": 3070 + }, + { + "epoch": 0.08806290207290922, + "grad_norm": 0.7555598616600037, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.058, + "step": 3080 + }, + { + "epoch": 0.08834882058613296, + "grad_norm": 0.7763100266456604, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.074, + "step": 3090 + }, + { + "epoch": 0.08863473909935668, + "grad_norm": 0.5088932514190674, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.07, + "step": 3100 + }, + { + "epoch": 0.08892065761258042, + "grad_norm": 0.517383873462677, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.08920657612580414, + "grad_norm": 0.9673930406570435, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.061, + "step": 3120 + }, + { + "epoch": 0.08949249463902788, + "grad_norm": 1.1182832717895508, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0503, + "step": 3130 + }, + { + "epoch": 0.0897784131522516, + "grad_norm": 0.8064592480659485, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0527, + "step": 3140 + }, + { + "epoch": 0.09006433166547534, + "grad_norm": 1.3616310358047485, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0491, + "step": 3150 + }, + { + "epoch": 0.09035025017869908, + "grad_norm": 0.6205968856811523, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0492, + "step": 3160 + }, + { + "epoch": 0.0906361686919228, + "grad_norm": 0.9427729249000549, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.054, + "step": 3170 + }, + { + "epoch": 0.09092208720514654, + "grad_norm": 0.6940050721168518, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0622, + "step": 3180 + }, + { + "epoch": 0.09120800571837026, + "grad_norm": 0.7082361578941345, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0474, + "step": 3190 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.4606474041938782, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 0.09177984274481772, + "grad_norm": 0.46445760130882263, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0483, + "step": 3210 + }, + { + "epoch": 0.09206576125804146, + "grad_norm": 0.7431371212005615, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.046, + "step": 3220 + }, + { + "epoch": 0.09235167977126518, + "grad_norm": 0.8430010676383972, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0665, + "step": 3230 + }, + { + "epoch": 0.09263759828448892, + "grad_norm": 0.9888875484466553, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0676, + "step": 3240 + }, + { + "epoch": 0.09292351679771266, + "grad_norm": 0.792150616645813, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0636, + "step": 3250 + }, + { + "epoch": 0.09320943531093638, + "grad_norm": 0.859030544757843, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0634, + "step": 3260 + }, + { + "epoch": 0.09349535382416012, + "grad_norm": 0.7612795233726501, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0658, + "step": 3270 + }, + { + "epoch": 0.09378127233738384, + "grad_norm": 0.5470104217529297, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0514, + "step": 3280 + }, + { + "epoch": 0.09406719085060758, + "grad_norm": 0.6354894042015076, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0489, + "step": 3290 + }, + { + "epoch": 0.0943531093638313, + "grad_norm": 1.3852356672286987, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 0.09463902787705504, + "grad_norm": 0.5610274076461792, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.09492494639027876, + "grad_norm": 1.2192410230636597, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0525, + "step": 3320 + }, + { + "epoch": 0.0952108649035025, + "grad_norm": 1.06831955909729, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.051, + "step": 3330 + }, + { + "epoch": 0.09549678341672624, + "grad_norm": 0.32288479804992676, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0503, + "step": 3340 + }, + { + "epoch": 0.09578270192994996, + "grad_norm": 0.5871645212173462, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0564, + "step": 3350 + }, + { + "epoch": 0.0960686204431737, + "grad_norm": 0.6069591045379639, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0495, + "step": 3360 + }, + { + "epoch": 0.09635453895639742, + "grad_norm": 1.0015379190444946, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0624, + "step": 3370 + }, + { + "epoch": 0.09664045746962116, + "grad_norm": 0.7534980773925781, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0618, + "step": 3380 + }, + { + "epoch": 0.09692637598284488, + "grad_norm": 0.45888280868530273, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0445, + "step": 3390 + }, + { + "epoch": 0.09721229449606862, + "grad_norm": 0.7550806403160095, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0461, + "step": 3400 + }, + { + "epoch": 0.09749821300929234, + "grad_norm": 0.4738181531429291, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 0.09778413152251608, + "grad_norm": 0.6711190938949585, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0476, + "step": 3420 + }, + { + "epoch": 0.09807005003573982, + "grad_norm": 0.4751316010951996, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0507, + "step": 3430 + }, + { + "epoch": 0.09835596854896354, + "grad_norm": 0.83565753698349, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "epoch": 0.09864188706218728, + "grad_norm": 0.5360665321350098, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0627, + "step": 3450 + }, + { + "epoch": 0.098927805575411, + "grad_norm": 0.7463604211807251, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0496, + "step": 3460 + }, + { + "epoch": 0.09921372408863474, + "grad_norm": 0.7294344305992126, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0697, + "step": 3470 + }, + { + "epoch": 0.09949964260185847, + "grad_norm": 0.5676283836364746, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0541, + "step": 3480 + }, + { + "epoch": 0.0997855611150822, + "grad_norm": 0.5879732370376587, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 0.10007147962830593, + "grad_norm": 0.832818865776062, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0505, + "step": 3500 + }, + { + "epoch": 0.10035739814152966, + "grad_norm": 0.48553410172462463, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0573, + "step": 3510 + }, + { + "epoch": 0.1006433166547534, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0449, + "step": 3520 + }, + { + "epoch": 0.10092923516797712, + "grad_norm": 0.7497885227203369, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0737, + "step": 3530 + }, + { + "epoch": 0.10121515368120086, + "grad_norm": 0.5581928491592407, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0514, + "step": 3540 + }, + { + "epoch": 0.10150107219442459, + "grad_norm": 1.140236258506775, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0524, + "step": 3550 + }, + { + "epoch": 0.10178699070764832, + "grad_norm": 0.8161870241165161, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0721, + "step": 3560 + }, + { + "epoch": 0.10207290922087205, + "grad_norm": 0.8796533942222595, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0564, + "step": 3570 + }, + { + "epoch": 0.10235882773409578, + "grad_norm": 1.4811128377914429, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.10264474624731951, + "grad_norm": 0.8029062747955322, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0589, + "step": 3590 + }, + { + "epoch": 0.10293066476054324, + "grad_norm": 0.7806634902954102, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0617, + "step": 3600 + }, + { + "epoch": 0.10321658327376698, + "grad_norm": 1.1286838054656982, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0574, + "step": 3610 + }, + { + "epoch": 0.1035025017869907, + "grad_norm": 0.374104768037796, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.047, + "step": 3620 + }, + { + "epoch": 0.10378842030021444, + "grad_norm": 1.1743136644363403, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0676, + "step": 3630 + }, + { + "epoch": 0.10407433881343817, + "grad_norm": 0.7684413194656372, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0663, + "step": 3640 + }, + { + "epoch": 0.1043602573266619, + "grad_norm": 1.0642409324645996, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.051, + "step": 3650 + }, + { + "epoch": 0.10464617583988563, + "grad_norm": 0.7752460837364197, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0543, + "step": 3660 + }, + { + "epoch": 0.10493209435310936, + "grad_norm": 0.9053257703781128, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.051, + "step": 3670 + }, + { + "epoch": 0.10521801286633309, + "grad_norm": 0.7407983541488647, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 0.10550393137955683, + "grad_norm": 1.3622519969940186, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0468, + "step": 3690 + }, + { + "epoch": 0.10578984989278056, + "grad_norm": 1.2751463651657104, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0571, + "step": 3700 + }, + { + "epoch": 0.10607576840600429, + "grad_norm": 0.5535411238670349, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0564, + "step": 3710 + }, + { + "epoch": 0.10636168691922802, + "grad_norm": 0.6728671193122864, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0487, + "step": 3720 + }, + { + "epoch": 0.10664760543245175, + "grad_norm": 0.82345050573349, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0656, + "step": 3730 + }, + { + "epoch": 0.10693352394567548, + "grad_norm": 0.6446594595909119, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0467, + "step": 3740 + }, + { + "epoch": 0.10721944245889921, + "grad_norm": 1.0836280584335327, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0536, + "step": 3750 + }, + { + "epoch": 0.10750536097212295, + "grad_norm": 0.3758300840854645, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0476, + "step": 3760 + }, + { + "epoch": 0.10779127948534667, + "grad_norm": 0.682266116142273, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0519, + "step": 3770 + }, + { + "epoch": 0.1080771979985704, + "grad_norm": 0.5025804042816162, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0474, + "step": 3780 + }, + { + "epoch": 0.10836311651179414, + "grad_norm": 1.019890308380127, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0492, + "step": 3790 + }, + { + "epoch": 0.10864903502501787, + "grad_norm": 0.7843710780143738, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0561, + "step": 3800 + }, + { + "epoch": 0.1089349535382416, + "grad_norm": 0.5028522610664368, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0582, + "step": 3810 + }, + { + "epoch": 0.10922087205146533, + "grad_norm": 0.6400144696235657, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0502, + "step": 3820 + }, + { + "epoch": 0.10950679056468907, + "grad_norm": 0.9432899355888367, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0471, + "step": 3830 + }, + { + "epoch": 0.10979270907791279, + "grad_norm": 0.7582482695579529, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.052, + "step": 3840 + }, + { + "epoch": 0.11007862759113653, + "grad_norm": 0.34035608172416687, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0464, + "step": 3850 + }, + { + "epoch": 0.11036454610436025, + "grad_norm": 1.3330878019332886, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.11065046461758399, + "grad_norm": 0.7309219837188721, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.052, + "step": 3870 + }, + { + "epoch": 0.11093638313080773, + "grad_norm": 0.6248922944068909, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0815, + "step": 3880 + }, + { + "epoch": 0.11122230164403145, + "grad_norm": 0.8298835158348083, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0491, + "step": 3890 + }, + { + "epoch": 0.11150822015725519, + "grad_norm": 0.6728928685188293, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0506, + "step": 3900 + }, + { + "epoch": 0.11179413867047891, + "grad_norm": 0.8456764817237854, + "learning_rate": 1.95567930185928e-05, + "loss": 0.051, + "step": 3910 + }, + { + "epoch": 0.11208005718370265, + "grad_norm": 0.9024212956428528, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0514, + "step": 3920 + }, + { + "epoch": 0.11236597569692637, + "grad_norm": 0.4843275845050812, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.056, + "step": 3930 + }, + { + "epoch": 0.11265189421015011, + "grad_norm": 0.5677530765533447, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0548, + "step": 3940 + }, + { + "epoch": 0.11293781272337383, + "grad_norm": 1.0913296937942505, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0697, + "step": 3950 + }, + { + "epoch": 0.11322373123659757, + "grad_norm": 0.6271129250526428, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0655, + "step": 3960 + }, + { + "epoch": 0.1135096497498213, + "grad_norm": 0.9063813090324402, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0469, + "step": 3970 + }, + { + "epoch": 0.11379556826304503, + "grad_norm": 0.7493836283683777, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0482, + "step": 3980 + }, + { + "epoch": 0.11408148677626877, + "grad_norm": 0.8022870421409607, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0655, + "step": 3990 + }, + { + "epoch": 0.11436740528949249, + "grad_norm": 0.6266750693321228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0542, + "step": 4000 + }, + { + "epoch": 0.11465332380271623, + "grad_norm": 0.45027732849121094, + "learning_rate": 1.95260726824789e-05, + "loss": 0.058, + "step": 4010 + }, + { + "epoch": 0.11493924231593995, + "grad_norm": 0.950760543346405, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0552, + "step": 4020 + }, + { + "epoch": 0.11522516082916369, + "grad_norm": 0.6397078037261963, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0544, + "step": 4030 + }, + { + "epoch": 0.11551107934238741, + "grad_norm": 0.7060579657554626, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0565, + "step": 4040 + }, + { + "epoch": 0.11579699785561115, + "grad_norm": 0.7861781716346741, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0511, + "step": 4050 + }, + { + "epoch": 0.11608291636883489, + "grad_norm": 0.5479229688644409, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0509, + "step": 4060 + }, + { + "epoch": 0.11636883488205861, + "grad_norm": 0.3854960501194, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0623, + "step": 4070 + }, + { + "epoch": 0.11665475339528235, + "grad_norm": 1.9533435106277466, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0643, + "step": 4080 + }, + { + "epoch": 0.11694067190850607, + "grad_norm": 0.5853668451309204, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0453, + "step": 4090 + }, + { + "epoch": 0.11722659042172981, + "grad_norm": 0.6850668787956238, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0555, + "step": 4100 + }, + { + "epoch": 0.11751250893495353, + "grad_norm": 1.1605839729309082, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0586, + "step": 4110 + }, + { + "epoch": 0.11779842744817727, + "grad_norm": 0.7753151059150696, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0623, + "step": 4120 + }, + { + "epoch": 0.118084345961401, + "grad_norm": 0.7955726385116577, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0467, + "step": 4130 + }, + { + "epoch": 0.11837026447462473, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0523, + "step": 4140 + }, + { + "epoch": 0.11865618298784847, + "grad_norm": 0.5821241140365601, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 0.11894210150107219, + "grad_norm": 0.4795539379119873, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0559, + "step": 4160 + }, + { + "epoch": 0.11922802001429593, + "grad_norm": 0.6324377655982971, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0473, + "step": 4170 + }, + { + "epoch": 0.11951393852751965, + "grad_norm": 0.8578745722770691, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0474, + "step": 4180 + }, + { + "epoch": 0.11979985704074339, + "grad_norm": 0.5988736748695374, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 0.12008577555396711, + "grad_norm": 0.8098701238632202, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0511, + "step": 4200 + }, + { + "epoch": 0.12037169406719085, + "grad_norm": 1.2059956789016724, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0501, + "step": 4210 + }, + { + "epoch": 0.12065761258041457, + "grad_norm": 0.7477571368217468, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0565, + "step": 4220 + }, + { + "epoch": 0.12094353109363831, + "grad_norm": 0.467942476272583, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0601, + "step": 4230 + }, + { + "epoch": 0.12122944960686205, + "grad_norm": 0.5761682391166687, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.052, + "step": 4240 + }, + { + "epoch": 0.12151536812008577, + "grad_norm": 0.8247032761573792, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0503, + "step": 4250 + }, + { + "epoch": 0.12180128663330951, + "grad_norm": 0.5218040347099304, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0434, + "step": 4260 + }, + { + "epoch": 0.12208720514653323, + "grad_norm": 0.5024936199188232, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 0.12237312365975697, + "grad_norm": 0.5558021664619446, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0493, + "step": 4280 + }, + { + "epoch": 0.1226590421729807, + "grad_norm": 0.6252139210700989, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0467, + "step": 4290 + }, + { + "epoch": 0.12294496068620443, + "grad_norm": 0.6613588929176331, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0572, + "step": 4300 + }, + { + "epoch": 0.12323087919942816, + "grad_norm": 0.8098927736282349, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0551, + "step": 4310 + }, + { + "epoch": 0.1235167977126519, + "grad_norm": 0.8598331809043884, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0517, + "step": 4320 + }, + { + "epoch": 0.12380271622587563, + "grad_norm": 1.2555822134017944, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0499, + "step": 4330 + }, + { + "epoch": 0.12408863473909935, + "grad_norm": 0.5311633348464966, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0467, + "step": 4340 + }, + { + "epoch": 0.12437455325232309, + "grad_norm": 0.5674521327018738, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0564, + "step": 4350 + }, + { + "epoch": 0.12466047176554682, + "grad_norm": 0.5226582884788513, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0503, + "step": 4360 + }, + { + "epoch": 0.12494639027877055, + "grad_norm": 0.8510275483131409, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0626, + "step": 4370 + }, + { + "epoch": 0.1252323087919943, + "grad_norm": 1.6184005737304688, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0565, + "step": 4380 + }, + { + "epoch": 0.125518227305218, + "grad_norm": 0.7836401462554932, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0567, + "step": 4390 + }, + { + "epoch": 0.12580414581844174, + "grad_norm": 0.686989963054657, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0727, + "step": 4400 + }, + { + "epoch": 0.12609006433166547, + "grad_norm": 0.6000984907150269, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0459, + "step": 4410 + }, + { + "epoch": 0.1263759828448892, + "grad_norm": 0.8751336932182312, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0509, + "step": 4420 + }, + { + "epoch": 0.12666190135811295, + "grad_norm": 0.9281551837921143, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0536, + "step": 4430 + }, + { + "epoch": 0.12694781987133666, + "grad_norm": 0.5268979668617249, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 0.1272337383845604, + "grad_norm": 0.9246962070465088, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0525, + "step": 4450 + }, + { + "epoch": 0.12751965689778413, + "grad_norm": 1.2159569263458252, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0559, + "step": 4460 + }, + { + "epoch": 0.12780557541100787, + "grad_norm": 1.1705470085144043, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0473, + "step": 4470 + }, + { + "epoch": 0.12809149392423158, + "grad_norm": 0.4624033570289612, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0385, + "step": 4480 + }, + { + "epoch": 0.12837741243745532, + "grad_norm": 0.68497633934021, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.055, + "step": 4490 + }, + { + "epoch": 0.12866333095067906, + "grad_norm": 0.6132450699806213, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0512, + "step": 4500 + }, + { + "epoch": 0.1289492494639028, + "grad_norm": 0.7438398003578186, + "learning_rate": 1.935753861926916e-05, + "loss": 0.057, + "step": 4510 + }, + { + "epoch": 0.12923516797712653, + "grad_norm": 1.01064133644104, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0542, + "step": 4520 + }, + { + "epoch": 0.12952108649035024, + "grad_norm": 0.7620115280151367, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0511, + "step": 4530 + }, + { + "epoch": 0.12980700500357398, + "grad_norm": 0.8325042128562927, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0434, + "step": 4540 + }, + { + "epoch": 0.13009292351679771, + "grad_norm": 1.333525538444519, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0527, + "step": 4550 + }, + { + "epoch": 0.13037884203002145, + "grad_norm": 0.5498093962669373, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0455, + "step": 4560 + }, + { + "epoch": 0.13066476054324516, + "grad_norm": 0.8072503209114075, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0471, + "step": 4570 + }, + { + "epoch": 0.1309506790564689, + "grad_norm": 0.7596970200538635, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0476, + "step": 4580 + }, + { + "epoch": 0.13123659756969264, + "grad_norm": 0.5895066857337952, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.058, + "step": 4590 + }, + { + "epoch": 0.13152251608291637, + "grad_norm": 0.7977209687232971, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0453, + "step": 4600 + }, + { + "epoch": 0.1318084345961401, + "grad_norm": 0.6070771813392639, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0626, + "step": 4610 + }, + { + "epoch": 0.13209435310936382, + "grad_norm": 0.776318371295929, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0502, + "step": 4620 + }, + { + "epoch": 0.13238027162258756, + "grad_norm": 0.7913787961006165, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0495, + "step": 4630 + }, + { + "epoch": 0.1326661901358113, + "grad_norm": 0.7327920794487, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0537, + "step": 4640 + }, + { + "epoch": 0.13295210864903503, + "grad_norm": 1.2004122734069824, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0479, + "step": 4650 + }, + { + "epoch": 0.13323802716225874, + "grad_norm": 0.663301408290863, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0426, + "step": 4660 + }, + { + "epoch": 0.13352394567548248, + "grad_norm": 0.7744486331939697, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0538, + "step": 4670 + }, + { + "epoch": 0.13380986418870622, + "grad_norm": 0.6179795265197754, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.13409578270192996, + "grad_norm": 0.6461634635925293, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0458, + "step": 4690 + }, + { + "epoch": 0.1343817012151537, + "grad_norm": 0.6578474640846252, + "learning_rate": 1.928703895604588e-05, + "loss": 0.064, + "step": 4700 + }, + { + "epoch": 0.1346676197283774, + "grad_norm": 0.8851020336151123, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0632, + "step": 4710 + }, + { + "epoch": 0.13495353824160114, + "grad_norm": 0.4704781472682953, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0431, + "step": 4720 + }, + { + "epoch": 0.13523945675482488, + "grad_norm": 0.9809741377830505, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.059, + "step": 4730 + }, + { + "epoch": 0.13552537526804861, + "grad_norm": 0.9307458400726318, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0528, + "step": 4740 + }, + { + "epoch": 0.13581129378127232, + "grad_norm": 0.8084405660629272, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0603, + "step": 4750 + }, + { + "epoch": 0.13609721229449606, + "grad_norm": 0.6919799447059631, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0589, + "step": 4760 + }, + { + "epoch": 0.1363831308077198, + "grad_norm": 0.8543849587440491, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0508, + "step": 4770 + }, + { + "epoch": 0.13666904932094354, + "grad_norm": 0.6308473348617554, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0485, + "step": 4780 + }, + { + "epoch": 0.13695496783416727, + "grad_norm": 0.739931046962738, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0534, + "step": 4790 + }, + { + "epoch": 0.13724088634739098, + "grad_norm": 0.7895604372024536, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0449, + "step": 4800 + }, + { + "epoch": 0.13752680486061472, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0496, + "step": 4810 + }, + { + "epoch": 0.13781272337383846, + "grad_norm": 0.5999978184700012, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.1380986418870622, + "grad_norm": 0.8037213087081909, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0667, + "step": 4830 + }, + { + "epoch": 0.1383845604002859, + "grad_norm": 0.7414689064025879, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0509, + "step": 4840 + }, + { + "epoch": 0.13867047891350964, + "grad_norm": 0.6627739667892456, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0515, + "step": 4850 + }, + { + "epoch": 0.13895639742673338, + "grad_norm": 0.6969587802886963, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0626, + "step": 4860 + }, + { + "epoch": 0.13924231593995712, + "grad_norm": 0.7554855942726135, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0478, + "step": 4870 + }, + { + "epoch": 0.13952823445318085, + "grad_norm": 0.5623564124107361, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.044, + "step": 4880 + }, + { + "epoch": 0.13981415296640456, + "grad_norm": 0.6897832751274109, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0466, + "step": 4890 + }, + { + "epoch": 0.1401000714796283, + "grad_norm": 0.5474520921707153, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0514, + "step": 4900 + }, + { + "epoch": 0.14038598999285204, + "grad_norm": 0.9736361503601074, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0472, + "step": 4910 + }, + { + "epoch": 0.14067190850607578, + "grad_norm": 0.5566041469573975, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0533, + "step": 4920 + }, + { + "epoch": 0.1409578270192995, + "grad_norm": 1.0295166969299316, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0478, + "step": 4930 + }, + { + "epoch": 0.14124374553252322, + "grad_norm": 1.0931389331817627, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0652, + "step": 4940 + }, + { + "epoch": 0.14152966404574696, + "grad_norm": 1.3054399490356445, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0564, + "step": 4950 + }, + { + "epoch": 0.1418155825589707, + "grad_norm": 0.45592883229255676, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0456, + "step": 4960 + }, + { + "epoch": 0.14210150107219444, + "grad_norm": 0.6758268475532532, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0543, + "step": 4970 + }, + { + "epoch": 0.14238741958541815, + "grad_norm": 0.9643615484237671, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 0.14267333809864188, + "grad_norm": 0.565969705581665, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0458, + "step": 4990 + }, + { + "epoch": 0.14295925661186562, + "grad_norm": 0.8053064346313477, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0558, + "step": 5000 + }, + { + "epoch": 0.14324517512508936, + "grad_norm": 0.606215238571167, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0562, + "step": 5010 + }, + { + "epoch": 0.14353109363831307, + "grad_norm": 0.5565656423568726, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0543, + "step": 5020 + }, + { + "epoch": 0.1438170121515368, + "grad_norm": 0.353696346282959, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0451, + "step": 5030 + }, + { + "epoch": 0.14410293066476054, + "grad_norm": 0.6627641916275024, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0607, + "step": 5040 + }, + { + "epoch": 0.14438884917798428, + "grad_norm": 0.7896742224693298, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0496, + "step": 5050 + }, + { + "epoch": 0.14467476769120802, + "grad_norm": 0.7444631457328796, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0641, + "step": 5060 + }, + { + "epoch": 0.14496068620443173, + "grad_norm": 0.7871376872062683, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 0.14524660471765546, + "grad_norm": 0.7784642577171326, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0466, + "step": 5080 + }, + { + "epoch": 0.1455325232308792, + "grad_norm": 0.6950685381889343, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0457, + "step": 5090 + }, + { + "epoch": 0.14581844174410294, + "grad_norm": 1.0631619691848755, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0513, + "step": 5100 + }, + { + "epoch": 0.14610436025732665, + "grad_norm": 0.4327051639556885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0599, + "step": 5110 + }, + { + "epoch": 0.14639027877055039, + "grad_norm": 0.7790032029151917, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0617, + "step": 5120 + }, + { + "epoch": 0.14667619728377412, + "grad_norm": 0.42061591148376465, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.14696211579699786, + "grad_norm": 1.4090712070465088, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0564, + "step": 5140 + }, + { + "epoch": 0.1472480343102216, + "grad_norm": 0.540844738483429, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0505, + "step": 5150 + }, + { + "epoch": 0.1475339528234453, + "grad_norm": 0.5608566999435425, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0583, + "step": 5160 + }, + { + "epoch": 0.14781987133666905, + "grad_norm": 0.750708818435669, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0467, + "step": 5170 + }, + { + "epoch": 0.14810578984989278, + "grad_norm": 0.608989953994751, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0507, + "step": 5180 + }, + { + "epoch": 0.14839170836311652, + "grad_norm": 0.8176707029342651, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0455, + "step": 5190 + }, + { + "epoch": 0.14867762687634023, + "grad_norm": 0.5280511379241943, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0586, + "step": 5200 + }, + { + "epoch": 0.14896354538956397, + "grad_norm": 0.5914652347564697, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.042, + "step": 5210 + }, + { + "epoch": 0.1492494639027877, + "grad_norm": 0.4816238582134247, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0431, + "step": 5220 + }, + { + "epoch": 0.14953538241601144, + "grad_norm": 0.5413132309913635, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0453, + "step": 5230 + }, + { + "epoch": 0.14982130092923518, + "grad_norm": 0.749200701713562, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0505, + "step": 5240 + }, + { + "epoch": 0.1501072194424589, + "grad_norm": 0.8051598072052002, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.15039313795568263, + "grad_norm": 0.5365609526634216, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0383, + "step": 5260 + }, + { + "epoch": 0.15067905646890636, + "grad_norm": 0.5546812415122986, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0438, + "step": 5270 + }, + { + "epoch": 0.1509649749821301, + "grad_norm": 0.6248345375061035, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.045, + "step": 5280 + }, + { + "epoch": 0.1512508934953538, + "grad_norm": 0.42673179507255554, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0435, + "step": 5290 + }, + { + "epoch": 0.15153681200857755, + "grad_norm": 0.6677115559577942, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 0.15182273052180129, + "grad_norm": 0.4739227294921875, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0516, + "step": 5310 + }, + { + "epoch": 0.15210864903502502, + "grad_norm": 0.7931821346282959, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0566, + "step": 5320 + }, + { + "epoch": 0.15239456754824876, + "grad_norm": 0.6296460032463074, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0496, + "step": 5330 + }, + { + "epoch": 0.15268048606147247, + "grad_norm": 0.6713911890983582, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0462, + "step": 5340 + }, + { + "epoch": 0.1529664045746962, + "grad_norm": 1.088040828704834, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0663, + "step": 5350 + }, + { + "epoch": 0.15325232308791994, + "grad_norm": 1.4942265748977661, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0541, + "step": 5360 + }, + { + "epoch": 0.15353824160114368, + "grad_norm": 1.5721286535263062, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0546, + "step": 5370 + }, + { + "epoch": 0.1538241601143674, + "grad_norm": 0.9329798221588135, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0538, + "step": 5380 + }, + { + "epoch": 0.15411007862759113, + "grad_norm": 0.5658103823661804, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0561, + "step": 5390 + }, + { + "epoch": 0.15439599714081487, + "grad_norm": 0.6210218071937561, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.054, + "step": 5400 + }, + { + "epoch": 0.1546819156540386, + "grad_norm": 0.7934702634811401, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0506, + "step": 5410 + }, + { + "epoch": 0.15496783416726234, + "grad_norm": 1.0321810245513916, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0483, + "step": 5420 + }, + { + "epoch": 0.15525375268048605, + "grad_norm": 0.6226248145103455, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0464, + "step": 5430 + }, + { + "epoch": 0.1555396711937098, + "grad_norm": 0.6217877864837646, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0466, + "step": 5440 + }, + { + "epoch": 0.15582558970693353, + "grad_norm": 0.44068101048469543, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0517, + "step": 5450 + }, + { + "epoch": 0.15611150822015726, + "grad_norm": 0.4715922772884369, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0391, + "step": 5460 + }, + { + "epoch": 0.15639742673338097, + "grad_norm": 0.6649858951568604, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0524, + "step": 5470 + }, + { + "epoch": 0.1566833452466047, + "grad_norm": 0.5635918974876404, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.054, + "step": 5480 + }, + { + "epoch": 0.15696926375982845, + "grad_norm": 0.5584990978240967, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0559, + "step": 5490 + }, + { + "epoch": 0.15725518227305219, + "grad_norm": 0.7777124047279358, + "learning_rate": 1.895206504082557e-05, + "loss": 0.052, + "step": 5500 + }, + { + "epoch": 0.15754110078627592, + "grad_norm": 0.7057285308837891, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0507, + "step": 5510 + }, + { + "epoch": 0.15782701929949963, + "grad_norm": 0.4290146827697754, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0508, + "step": 5520 + }, + { + "epoch": 0.15811293781272337, + "grad_norm": 0.7333746552467346, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0609, + "step": 5530 + }, + { + "epoch": 0.1583988563259471, + "grad_norm": 0.6905514001846313, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0441, + "step": 5540 + }, + { + "epoch": 0.15868477483917084, + "grad_norm": 0.4859441816806793, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0586, + "step": 5550 + }, + { + "epoch": 0.15897069335239455, + "grad_norm": 0.4259501099586487, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0446, + "step": 5560 + }, + { + "epoch": 0.1592566118656183, + "grad_norm": 0.7659216523170471, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0486, + "step": 5570 + }, + { + "epoch": 0.15954253037884203, + "grad_norm": 0.6377918124198914, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0497, + "step": 5580 + }, + { + "epoch": 0.15982844889206577, + "grad_norm": 0.9122095704078674, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0497, + "step": 5590 + }, + { + "epoch": 0.1601143674052895, + "grad_norm": 0.5986319780349731, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0789, + "step": 5600 + }, + { + "epoch": 0.1604002859185132, + "grad_norm": 0.6486982107162476, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0481, + "step": 5610 + }, + { + "epoch": 0.16068620443173695, + "grad_norm": 0.9778286814689636, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0609, + "step": 5620 + }, + { + "epoch": 0.1609721229449607, + "grad_norm": 0.9133608341217041, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0499, + "step": 5630 + }, + { + "epoch": 0.16125804145818443, + "grad_norm": 0.8979085087776184, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0539, + "step": 5640 + }, + { + "epoch": 0.16154395997140814, + "grad_norm": 0.7787102460861206, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0593, + "step": 5650 + }, + { + "epoch": 0.16182987848463187, + "grad_norm": 0.8269296884536743, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0466, + "step": 5660 + }, + { + "epoch": 0.1621157969978556, + "grad_norm": 1.0018537044525146, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0542, + "step": 5670 + }, + { + "epoch": 0.16240171551107935, + "grad_norm": 0.6690066456794739, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0504, + "step": 5680 + }, + { + "epoch": 0.16268763402430308, + "grad_norm": 0.8186119198799133, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0523, + "step": 5690 + }, + { + "epoch": 0.1629735525375268, + "grad_norm": 0.6039218902587891, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.053, + "step": 5700 + }, + { + "epoch": 0.16325947105075053, + "grad_norm": 0.5570294857025146, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0627, + "step": 5710 + }, + { + "epoch": 0.16354538956397427, + "grad_norm": 0.6330029368400574, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.043, + "step": 5720 + }, + { + "epoch": 0.163831308077198, + "grad_norm": 0.42857953906059265, + "learning_rate": 1.884459101447439e-05, + "loss": 0.043, + "step": 5730 + }, + { + "epoch": 0.16411722659042172, + "grad_norm": 0.6611765027046204, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0478, + "step": 5740 + }, + { + "epoch": 0.16440314510364545, + "grad_norm": 0.5025321841239929, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0563, + "step": 5750 + }, + { + "epoch": 0.1646890636168692, + "grad_norm": 0.468772292137146, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0579, + "step": 5760 + }, + { + "epoch": 0.16497498213009293, + "grad_norm": 0.8914149403572083, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0451, + "step": 5770 + }, + { + "epoch": 0.16526090064331667, + "grad_norm": 0.7421362996101379, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0446, + "step": 5780 + }, + { + "epoch": 0.16554681915654038, + "grad_norm": 0.6159907579421997, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0486, + "step": 5790 + }, + { + "epoch": 0.1658327376697641, + "grad_norm": 0.7762402892112732, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0528, + "step": 5800 + }, + { + "epoch": 0.16611865618298785, + "grad_norm": 0.688562273979187, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0562, + "step": 5810 + }, + { + "epoch": 0.1664045746962116, + "grad_norm": 0.6233720183372498, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0454, + "step": 5820 + }, + { + "epoch": 0.1666904932094353, + "grad_norm": 1.0762931108474731, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0586, + "step": 5830 + }, + { + "epoch": 0.16697641172265903, + "grad_norm": 0.6782101988792419, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0486, + "step": 5840 + }, + { + "epoch": 0.16726233023588277, + "grad_norm": 0.8854986429214478, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0668, + "step": 5850 + }, + { + "epoch": 0.1675482487491065, + "grad_norm": 0.6537308096885681, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0456, + "step": 5860 + }, + { + "epoch": 0.16783416726233025, + "grad_norm": 1.4588080644607544, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0531, + "step": 5870 + }, + { + "epoch": 0.16812008577555396, + "grad_norm": 0.4888838529586792, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0608, + "step": 5880 + }, + { + "epoch": 0.1684060042887777, + "grad_norm": 0.6046859622001648, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0596, + "step": 5890 + }, + { + "epoch": 0.16869192280200143, + "grad_norm": 1.0373053550720215, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0672, + "step": 5900 + }, + { + "epoch": 0.16897784131522517, + "grad_norm": 0.7728743553161621, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0567, + "step": 5910 + }, + { + "epoch": 0.16926375982844888, + "grad_norm": 0.7804396152496338, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0443, + "step": 5920 + }, + { + "epoch": 0.16954967834167262, + "grad_norm": 0.5331568717956543, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0462, + "step": 5930 + }, + { + "epoch": 0.16983559685489635, + "grad_norm": 0.5623118877410889, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0549, + "step": 5940 + }, + { + "epoch": 0.1701215153681201, + "grad_norm": 0.5113009214401245, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.17040743388134383, + "grad_norm": 0.45996031165122986, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0474, + "step": 5960 + }, + { + "epoch": 0.17069335239456754, + "grad_norm": 0.9673702716827393, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0496, + "step": 5970 + }, + { + "epoch": 0.17097927090779128, + "grad_norm": 0.6134442687034607, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0472, + "step": 5980 + }, + { + "epoch": 0.171265189421015, + "grad_norm": 0.5929660797119141, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0448, + "step": 5990 + }, + { + "epoch": 0.17155110793423875, + "grad_norm": 0.6973591446876526, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0561, + "step": 6000 + }, + { + "epoch": 0.17183702644746246, + "grad_norm": 0.6361686587333679, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0528, + "step": 6010 + }, + { + "epoch": 0.1721229449606862, + "grad_norm": 0.8463344573974609, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0505, + "step": 6020 + }, + { + "epoch": 0.17240886347390993, + "grad_norm": 0.7931243777275085, + "learning_rate": 1.869709961183946e-05, + "loss": 0.047, + "step": 6030 + }, + { + "epoch": 0.17269478198713367, + "grad_norm": 0.8827673196792603, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0449, + "step": 6040 + }, + { + "epoch": 0.1729807005003574, + "grad_norm": 0.624167263507843, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0432, + "step": 6050 + }, + { + "epoch": 0.17326661901358112, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0546, + "step": 6060 + }, + { + "epoch": 0.17355253752680486, + "grad_norm": 0.6836652755737305, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0463, + "step": 6070 + }, + { + "epoch": 0.1738384560400286, + "grad_norm": 0.5454772114753723, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0554, + "step": 6080 + }, + { + "epoch": 0.17412437455325233, + "grad_norm": 0.3758164048194885, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0437, + "step": 6090 + }, + { + "epoch": 0.17441029306647604, + "grad_norm": 0.4269026517868042, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0436, + "step": 6100 + }, + { + "epoch": 0.17469621157969978, + "grad_norm": 1.3504232168197632, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0563, + "step": 6110 + }, + { + "epoch": 0.17498213009292352, + "grad_norm": 0.6270191669464111, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0552, + "step": 6120 + }, + { + "epoch": 0.17526804860614725, + "grad_norm": 0.7632624506950378, + "learning_rate": 1.864612143364565e-05, + "loss": 0.042, + "step": 6130 + }, + { + "epoch": 0.175553967119371, + "grad_norm": 0.7420883774757385, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0472, + "step": 6140 + }, + { + "epoch": 0.1758398856325947, + "grad_norm": 0.38518550992012024, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0494, + "step": 6150 + }, + { + "epoch": 0.17612580414581844, + "grad_norm": 0.4203122556209564, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.049, + "step": 6160 + }, + { + "epoch": 0.17641172265904217, + "grad_norm": 0.843169093132019, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0528, + "step": 6170 + }, + { + "epoch": 0.1766976411722659, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0441, + "step": 6180 + }, + { + "epoch": 0.17698355968548962, + "grad_norm": 0.9894040822982788, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0494, + "step": 6190 + }, + { + "epoch": 0.17726947819871336, + "grad_norm": 0.8269744515419006, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0533, + "step": 6200 + }, + { + "epoch": 0.1775553967119371, + "grad_norm": 0.7923200726509094, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0518, + "step": 6210 + }, + { + "epoch": 0.17784131522516083, + "grad_norm": 0.580436646938324, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 0.17812723373838457, + "grad_norm": 1.0633399486541748, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0528, + "step": 6230 + }, + { + "epoch": 0.17841315225160828, + "grad_norm": 0.925599217414856, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0552, + "step": 6240 + }, + { + "epoch": 0.17869907076483202, + "grad_norm": 0.5874597430229187, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0473, + "step": 6250 + }, + { + "epoch": 0.17898498927805576, + "grad_norm": 0.9065818190574646, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0504, + "step": 6260 + }, + { + "epoch": 0.1792709077912795, + "grad_norm": 0.9060930609703064, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0578, + "step": 6270 + }, + { + "epoch": 0.1795568263045032, + "grad_norm": 0.6221855878829956, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.17984274481772694, + "grad_norm": 0.589621901512146, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0452, + "step": 6290 + }, + { + "epoch": 0.18012866333095068, + "grad_norm": 0.4308580756187439, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0528, + "step": 6300 + }, + { + "epoch": 0.18041458184417442, + "grad_norm": 0.34031248092651367, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0544, + "step": 6310 + }, + { + "epoch": 0.18070050035739815, + "grad_norm": 0.6438931226730347, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0411, + "step": 6320 + }, + { + "epoch": 0.18098641887062186, + "grad_norm": 0.5436957478523254, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0381, + "step": 6330 + }, + { + "epoch": 0.1812723373838456, + "grad_norm": 0.7326043248176575, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0486, + "step": 6340 + }, + { + "epoch": 0.18155825589706934, + "grad_norm": 0.9194608330726624, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0455, + "step": 6350 + }, + { + "epoch": 0.18184417441029307, + "grad_norm": 0.9366886019706726, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0529, + "step": 6360 + }, + { + "epoch": 0.18213009292351678, + "grad_norm": 0.3178311586380005, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0455, + "step": 6370 + }, + { + "epoch": 0.18241601143674052, + "grad_norm": 0.9811000823974609, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.054, + "step": 6380 + }, + { + "epoch": 0.18270192994996426, + "grad_norm": 0.4635869562625885, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0466, + "step": 6390 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.6958444118499756, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0448, + "step": 6400 + }, + { + "epoch": 0.18327376697641173, + "grad_norm": 0.765814483165741, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0558, + "step": 6410 + }, + { + "epoch": 0.18355968548963544, + "grad_norm": 0.4117525815963745, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0484, + "step": 6420 + }, + { + "epoch": 0.18384560400285918, + "grad_norm": 0.6114997267723083, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0449, + "step": 6430 + }, + { + "epoch": 0.18413152251608292, + "grad_norm": 0.6006572842597961, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0442, + "step": 6440 + }, + { + "epoch": 0.18441744102930666, + "grad_norm": 0.5918669104576111, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0472, + "step": 6450 + }, + { + "epoch": 0.18470335954253037, + "grad_norm": 0.42107391357421875, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0471, + "step": 6460 + }, + { + "epoch": 0.1849892780557541, + "grad_norm": 0.5666350722312927, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0451, + "step": 6470 + }, + { + "epoch": 0.18527519656897784, + "grad_norm": 0.6074198484420776, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.051, + "step": 6480 + }, + { + "epoch": 0.18556111508220158, + "grad_norm": 0.771105945110321, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0402, + "step": 6490 + }, + { + "epoch": 0.18584703359542531, + "grad_norm": 0.6381934881210327, + "learning_rate": 1.844974808419918e-05, + "loss": 0.049, + "step": 6500 + }, + { + "epoch": 0.18613295210864902, + "grad_norm": 0.4039069712162018, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0477, + "step": 6510 + }, + { + "epoch": 0.18641887062187276, + "grad_norm": 0.8936404585838318, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0515, + "step": 6520 + }, + { + "epoch": 0.1867047891350965, + "grad_norm": 0.5358276963233948, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0397, + "step": 6530 + }, + { + "epoch": 0.18699070764832024, + "grad_norm": 0.7260947823524475, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0501, + "step": 6540 + }, + { + "epoch": 0.18727662616154395, + "grad_norm": 0.6378960609436035, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0575, + "step": 6550 + }, + { + "epoch": 0.18756254467476768, + "grad_norm": 0.5879429578781128, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.041, + "step": 6560 + }, + { + "epoch": 0.18784846318799142, + "grad_norm": 0.846297025680542, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0494, + "step": 6570 + }, + { + "epoch": 0.18813438170121516, + "grad_norm": 0.5211764574050903, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0463, + "step": 6580 + }, + { + "epoch": 0.1884203002144389, + "grad_norm": 0.8060504794120789, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0486, + "step": 6590 + }, + { + "epoch": 0.1887062187276626, + "grad_norm": 0.5741685628890991, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0435, + "step": 6600 + }, + { + "epoch": 0.18899213724088634, + "grad_norm": 0.6195408701896667, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0609, + "step": 6610 + }, + { + "epoch": 0.18927805575411008, + "grad_norm": 0.46843090653419495, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0477, + "step": 6620 + }, + { + "epoch": 0.18956397426733382, + "grad_norm": 0.5169982314109802, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0515, + "step": 6630 + }, + { + "epoch": 0.18984989278055753, + "grad_norm": 0.5571608543395996, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0492, + "step": 6640 + }, + { + "epoch": 0.19013581129378126, + "grad_norm": 0.7798209190368652, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0682, + "step": 6650 + }, + { + "epoch": 0.190421729807005, + "grad_norm": 0.6120383143424988, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0516, + "step": 6660 + }, + { + "epoch": 0.19070764832022874, + "grad_norm": 1.0191924571990967, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.049, + "step": 6670 + }, + { + "epoch": 0.19099356683345248, + "grad_norm": 0.5271646976470947, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0461, + "step": 6680 + }, + { + "epoch": 0.1912794853466762, + "grad_norm": 0.3315111994743347, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 0.19156540385989992, + "grad_norm": 0.7598944306373596, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0576, + "step": 6700 + }, + { + "epoch": 0.19185132237312366, + "grad_norm": 0.8039186596870422, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0489, + "step": 6710 + }, + { + "epoch": 0.1921372408863474, + "grad_norm": 0.911704957485199, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0508, + "step": 6720 + }, + { + "epoch": 0.1924231593995711, + "grad_norm": 0.6092261672019958, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0494, + "step": 6730 + }, + { + "epoch": 0.19270907791279485, + "grad_norm": 0.7890674471855164, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.049, + "step": 6740 + }, + { + "epoch": 0.19299499642601858, + "grad_norm": 0.8601320385932922, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0402, + "step": 6750 + }, + { + "epoch": 0.19328091493924232, + "grad_norm": 0.8750951290130615, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0517, + "step": 6760 + }, + { + "epoch": 0.19356683345246606, + "grad_norm": 0.7143217921257019, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0414, + "step": 6770 + }, + { + "epoch": 0.19385275196568977, + "grad_norm": 0.8340809345245361, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.1941386704789135, + "grad_norm": 0.4074079692363739, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0455, + "step": 6790 + }, + { + "epoch": 0.19442458899213724, + "grad_norm": 0.5369135737419128, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0472, + "step": 6800 + }, + { + "epoch": 0.19471050750536098, + "grad_norm": 0.44467195868492126, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 0.1949964260185847, + "grad_norm": 0.6032440662384033, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0479, + "step": 6820 + }, + { + "epoch": 0.19528234453180843, + "grad_norm": 0.4078349173069, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 0.19556826304503216, + "grad_norm": 0.49480268359184265, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0432, + "step": 6840 + }, + { + "epoch": 0.1958541815582559, + "grad_norm": 0.9844514727592468, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0479, + "step": 6850 + }, + { + "epoch": 0.19614010007147964, + "grad_norm": 1.1353951692581177, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0539, + "step": 6860 + }, + { + "epoch": 0.19642601858470335, + "grad_norm": 0.7535272836685181, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0572, + "step": 6870 + }, + { + "epoch": 0.1967119370979271, + "grad_norm": 0.4950162470340729, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0466, + "step": 6880 + }, + { + "epoch": 0.19699785561115082, + "grad_norm": 0.5310598015785217, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0487, + "step": 6890 + }, + { + "epoch": 0.19728377412437456, + "grad_norm": 0.9481188654899597, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0448, + "step": 6900 + }, + { + "epoch": 0.19756969263759827, + "grad_norm": 0.5303207039833069, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0419, + "step": 6910 + }, + { + "epoch": 0.197855611150822, + "grad_norm": 0.6180852055549622, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 0.19814152966404575, + "grad_norm": 0.5310384631156921, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0471, + "step": 6930 + }, + { + "epoch": 0.19842744817726948, + "grad_norm": 0.546660304069519, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0481, + "step": 6940 + }, + { + "epoch": 0.19871336669049322, + "grad_norm": 0.7824214696884155, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0423, + "step": 6950 + }, + { + "epoch": 0.19899928520371693, + "grad_norm": 0.9130761623382568, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0436, + "step": 6960 + }, + { + "epoch": 0.19928520371694067, + "grad_norm": 1.0512481927871704, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 0.1995711222301644, + "grad_norm": 0.8660218715667725, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0533, + "step": 6980 + }, + { + "epoch": 0.19985704074338814, + "grad_norm": 0.5280078649520874, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0455, + "step": 6990 + }, + { + "epoch": 0.20014295925661185, + "grad_norm": 0.6151753067970276, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0476, + "step": 7000 + }, + { + "epoch": 0.2004288777698356, + "grad_norm": 0.7165628671646118, + "learning_rate": 1.815952390818299e-05, + "loss": 0.051, + "step": 7010 + }, + { + "epoch": 0.20071479628305933, + "grad_norm": 0.6857513189315796, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0566, + "step": 7020 + }, + { + "epoch": 0.20100071479628306, + "grad_norm": 0.5589154958724976, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0741, + "step": 7030 + }, + { + "epoch": 0.2012866333095068, + "grad_norm": 0.6684713959693909, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0461, + "step": 7040 + }, + { + "epoch": 0.2015725518227305, + "grad_norm": 0.41142046451568604, + "learning_rate": 1.813582526827608e-05, + "loss": 0.043, + "step": 7050 + }, + { + "epoch": 0.20185847033595425, + "grad_norm": 0.29734253883361816, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0464, + "step": 7060 + }, + { + "epoch": 0.20214438884917799, + "grad_norm": 0.3914707899093628, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.052, + "step": 7070 + }, + { + "epoch": 0.20243030736240172, + "grad_norm": 0.5075880885124207, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0401, + "step": 7080 + }, + { + "epoch": 0.20271622587562543, + "grad_norm": 0.6182138919830322, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0428, + "step": 7090 + }, + { + "epoch": 0.20300214438884917, + "grad_norm": 1.0438663959503174, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0476, + "step": 7100 + }, + { + "epoch": 0.2032880629020729, + "grad_norm": 0.4646940529346466, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0419, + "step": 7110 + }, + { + "epoch": 0.20357398141529665, + "grad_norm": 0.4236893951892853, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0539, + "step": 7120 + }, + { + "epoch": 0.20385989992852038, + "grad_norm": 0.7975651025772095, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0459, + "step": 7130 + }, + { + "epoch": 0.2041458184417441, + "grad_norm": 0.9628227949142456, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0568, + "step": 7140 + }, + { + "epoch": 0.20443173695496783, + "grad_norm": 0.8878718614578247, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0528, + "step": 7150 + }, + { + "epoch": 0.20471765546819157, + "grad_norm": 0.5407359004020691, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0417, + "step": 7160 + }, + { + "epoch": 0.2050035739814153, + "grad_norm": 0.4407803416252136, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0435, + "step": 7170 + }, + { + "epoch": 0.20528949249463901, + "grad_norm": 0.4055456221103668, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0652, + "step": 7180 + }, + { + "epoch": 0.20557541100786275, + "grad_norm": 0.44706887006759644, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0476, + "step": 7190 + }, + { + "epoch": 0.2058613295210865, + "grad_norm": 1.2640881538391113, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0496, + "step": 7200 + }, + { + "epoch": 0.20614724803431023, + "grad_norm": 0.3773214817047119, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0455, + "step": 7210 + }, + { + "epoch": 0.20643316654753396, + "grad_norm": 0.6460191011428833, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0765, + "step": 7220 + }, + { + "epoch": 0.20671908506075767, + "grad_norm": 0.6048172116279602, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0506, + "step": 7230 + }, + { + "epoch": 0.2070050035739814, + "grad_norm": 0.38502392172813416, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0487, + "step": 7240 + }, + { + "epoch": 0.20729092208720515, + "grad_norm": 1.5727262496948242, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.20757684060042889, + "grad_norm": 0.3985368609428406, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0501, + "step": 7260 + }, + { + "epoch": 0.2078627591136526, + "grad_norm": 0.4519219994544983, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0542, + "step": 7270 + }, + { + "epoch": 0.20814867762687633, + "grad_norm": 0.6547327637672424, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0628, + "step": 7280 + }, + { + "epoch": 0.20843459614010007, + "grad_norm": 0.7864896655082703, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0521, + "step": 7290 + }, + { + "epoch": 0.2087205146533238, + "grad_norm": 0.6605416536331177, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0501, + "step": 7300 + }, + { + "epoch": 0.20900643316654754, + "grad_norm": 0.8260928988456726, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 0.20929235167977125, + "grad_norm": 0.7167025804519653, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0465, + "step": 7320 + }, + { + "epoch": 0.209578270192995, + "grad_norm": 0.6838316917419434, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0449, + "step": 7330 + }, + { + "epoch": 0.20986418870621873, + "grad_norm": 0.46520882844924927, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0441, + "step": 7340 + }, + { + "epoch": 0.21015010721944247, + "grad_norm": 0.680860698223114, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0498, + "step": 7350 + }, + { + "epoch": 0.21043602573266618, + "grad_norm": 0.6697542071342468, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0361, + "step": 7360 + }, + { + "epoch": 0.21072194424588991, + "grad_norm": 0.9322425127029419, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0561, + "step": 7370 + }, + { + "epoch": 0.21100786275911365, + "grad_norm": 0.7454982399940491, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0464, + "step": 7380 + }, + { + "epoch": 0.2112937812723374, + "grad_norm": 0.5052962899208069, + "learning_rate": 1.792902262617481e-05, + "loss": 0.042, + "step": 7390 + }, + { + "epoch": 0.21157969978556113, + "grad_norm": 0.622719407081604, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0392, + "step": 7400 + }, + { + "epoch": 0.21186561829878484, + "grad_norm": 0.8296751976013184, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0512, + "step": 7410 + }, + { + "epoch": 0.21215153681200857, + "grad_norm": 0.7341750860214233, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0407, + "step": 7420 + }, + { + "epoch": 0.2124374553252323, + "grad_norm": 0.8206498026847839, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0491, + "step": 7430 + }, + { + "epoch": 0.21272337383845605, + "grad_norm": 0.5625871419906616, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0405, + "step": 7440 + }, + { + "epoch": 0.21300929235167976, + "grad_norm": 0.600284218788147, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0464, + "step": 7450 + }, + { + "epoch": 0.2132952108649035, + "grad_norm": 1.0839911699295044, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0481, + "step": 7460 + }, + { + "epoch": 0.21358112937812723, + "grad_norm": 0.45663371682167053, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0478, + "step": 7470 + }, + { + "epoch": 0.21386704789135097, + "grad_norm": 0.9196961522102356, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0566, + "step": 7480 + }, + { + "epoch": 0.2141529664045747, + "grad_norm": 0.5013288855552673, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0442, + "step": 7490 + }, + { + "epoch": 0.21443888491779842, + "grad_norm": 0.6444706916809082, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0484, + "step": 7500 + }, + { + "epoch": 0.21472480343102215, + "grad_norm": 0.5789361000061035, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0536, + "step": 7510 + }, + { + "epoch": 0.2150107219442459, + "grad_norm": 0.7474827170372009, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0526, + "step": 7520 + }, + { + "epoch": 0.21529664045746963, + "grad_norm": 0.7054215669631958, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0538, + "step": 7530 + }, + { + "epoch": 0.21558255897069334, + "grad_norm": 0.9778858423233032, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0533, + "step": 7540 + }, + { + "epoch": 0.21586847748391708, + "grad_norm": 0.7189548015594482, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0479, + "step": 7550 + }, + { + "epoch": 0.2161543959971408, + "grad_norm": 0.8761522769927979, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0512, + "step": 7560 + }, + { + "epoch": 0.21644031451036455, + "grad_norm": 0.6686418652534485, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.06, + "step": 7570 + }, + { + "epoch": 0.2167262330235883, + "grad_norm": 0.6385156512260437, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0495, + "step": 7580 + }, + { + "epoch": 0.217012151536812, + "grad_norm": 0.4785522520542145, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0477, + "step": 7590 + }, + { + "epoch": 0.21729807005003574, + "grad_norm": 0.883179783821106, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0472, + "step": 7600 + }, + { + "epoch": 0.21758398856325947, + "grad_norm": 0.5431568026542664, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0383, + "step": 7610 + }, + { + "epoch": 0.2178699070764832, + "grad_norm": 0.7085764408111572, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0465, + "step": 7620 + }, + { + "epoch": 0.21815582558970692, + "grad_norm": 0.4877212643623352, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0517, + "step": 7630 + }, + { + "epoch": 0.21844174410293066, + "grad_norm": 0.6874392032623291, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0555, + "step": 7640 + }, + { + "epoch": 0.2187276626161544, + "grad_norm": 0.9611791372299194, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0415, + "step": 7650 + }, + { + "epoch": 0.21901358112937813, + "grad_norm": 0.3618314862251282, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0601, + "step": 7660 + }, + { + "epoch": 0.21929949964260187, + "grad_norm": 0.5366251468658447, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0383, + "step": 7670 + }, + { + "epoch": 0.21958541815582558, + "grad_norm": 0.6323129534721375, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0536, + "step": 7680 + }, + { + "epoch": 0.21987133666904932, + "grad_norm": 0.4621681571006775, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0442, + "step": 7690 + }, + { + "epoch": 0.22015725518227305, + "grad_norm": 0.9297679662704468, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0517, + "step": 7700 + }, + { + "epoch": 0.2204431736954968, + "grad_norm": 0.5950489640235901, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0468, + "step": 7710 + }, + { + "epoch": 0.2207290922087205, + "grad_norm": 0.30251142382621765, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0471, + "step": 7720 + }, + { + "epoch": 0.22101501072194424, + "grad_norm": 0.6247804760932922, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0488, + "step": 7730 + }, + { + "epoch": 0.22130092923516798, + "grad_norm": 0.7118366360664368, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0567, + "step": 7740 + }, + { + "epoch": 0.2215868477483917, + "grad_norm": 0.6265056133270264, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.06, + "step": 7750 + }, + { + "epoch": 0.22187276626161545, + "grad_norm": 0.7232056260108948, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0393, + "step": 7760 + }, + { + "epoch": 0.22215868477483916, + "grad_norm": 0.7981307506561279, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0518, + "step": 7770 + }, + { + "epoch": 0.2224446032880629, + "grad_norm": 0.4492819011211395, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0425, + "step": 7780 + }, + { + "epoch": 0.22273052180128664, + "grad_norm": 0.578440248966217, + "learning_rate": 1.767371389304538e-05, + "loss": 0.043, + "step": 7790 + }, + { + "epoch": 0.22301644031451037, + "grad_norm": 0.8093826174736023, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0571, + "step": 7800 + }, + { + "epoch": 0.22330235882773408, + "grad_norm": 0.864661455154419, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0429, + "step": 7810 + }, + { + "epoch": 0.22358827734095782, + "grad_norm": 0.50054532289505, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0404, + "step": 7820 + }, + { + "epoch": 0.22387419585418156, + "grad_norm": 0.5690511465072632, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0406, + "step": 7830 + }, + { + "epoch": 0.2241601143674053, + "grad_norm": 0.7075231671333313, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0478, + "step": 7840 + }, + { + "epoch": 0.22444603288062903, + "grad_norm": 0.6326742768287659, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.065, + "step": 7850 + }, + { + "epoch": 0.22473195139385274, + "grad_norm": 0.48305049538612366, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0456, + "step": 7860 + }, + { + "epoch": 0.22501786990707648, + "grad_norm": 0.6333707571029663, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.048, + "step": 7870 + }, + { + "epoch": 0.22530378842030022, + "grad_norm": 0.6568662524223328, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0422, + "step": 7880 + }, + { + "epoch": 0.22558970693352395, + "grad_norm": 0.6302695870399475, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0589, + "step": 7890 + }, + { + "epoch": 0.22587562544674766, + "grad_norm": 0.6373940110206604, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0504, + "step": 7900 + }, + { + "epoch": 0.2261615439599714, + "grad_norm": 0.7108445167541504, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0486, + "step": 7910 + }, + { + "epoch": 0.22644746247319514, + "grad_norm": 0.5274208784103394, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0693, + "step": 7920 + }, + { + "epoch": 0.22673338098641888, + "grad_norm": 0.4020678997039795, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0461, + "step": 7930 + }, + { + "epoch": 0.2270192994996426, + "grad_norm": 0.5584745407104492, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0376, + "step": 7940 + }, + { + "epoch": 0.22730521801286632, + "grad_norm": 0.6614044904708862, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0461, + "step": 7950 + }, + { + "epoch": 0.22759113652609006, + "grad_norm": 0.506636917591095, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0431, + "step": 7960 + }, + { + "epoch": 0.2278770550393138, + "grad_norm": 0.5168156027793884, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0404, + "step": 7970 + }, + { + "epoch": 0.22816297355253753, + "grad_norm": 0.552480161190033, + "learning_rate": 1.754802282200567e-05, + "loss": 0.0565, + "step": 7980 + }, + { + "epoch": 0.22844889206576124, + "grad_norm": 0.8191191554069519, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0556, + "step": 7990 + }, + { + "epoch": 0.22873481057898498, + "grad_norm": 0.7767695188522339, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0447, + "step": 8000 + }, + { + "epoch": 0.22902072909220872, + "grad_norm": 0.9050281047821045, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0611, + "step": 8010 + }, + { + "epoch": 0.22930664760543246, + "grad_norm": 0.7805314660072327, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0532, + "step": 8020 + }, + { + "epoch": 0.2295925661186562, + "grad_norm": 0.6055987477302551, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0436, + "step": 8030 + }, + { + "epoch": 0.2298784846318799, + "grad_norm": 1.1075741052627563, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.053, + "step": 8040 + }, + { + "epoch": 0.23016440314510364, + "grad_norm": 0.6283855438232422, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0494, + "step": 8050 + }, + { + "epoch": 0.23045032165832738, + "grad_norm": 0.44009697437286377, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.047, + "step": 8060 + }, + { + "epoch": 0.23073624017155112, + "grad_norm": 0.4920162856578827, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0437, + "step": 8070 + }, + { + "epoch": 0.23102215868477483, + "grad_norm": 0.9286724328994751, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0513, + "step": 8080 + }, + { + "epoch": 0.23130807719799856, + "grad_norm": 0.6595107913017273, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0465, + "step": 8090 + }, + { + "epoch": 0.2315939957112223, + "grad_norm": 0.4930933713912964, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0422, + "step": 8100 + }, + { + "epoch": 0.23187991422444604, + "grad_norm": 0.6741859316825867, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0419, + "step": 8110 + }, + { + "epoch": 0.23216583273766978, + "grad_norm": 0.8081800937652588, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0449, + "step": 8120 + }, + { + "epoch": 0.23245175125089348, + "grad_norm": 1.0258036851882935, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0613, + "step": 8130 + }, + { + "epoch": 0.23273766976411722, + "grad_norm": 0.5007345080375671, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0473, + "step": 8140 + }, + { + "epoch": 0.23302358827734096, + "grad_norm": 0.3931804895401001, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0495, + "step": 8150 + }, + { + "epoch": 0.2333095067905647, + "grad_norm": 0.5907166600227356, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0449, + "step": 8160 + }, + { + "epoch": 0.2335954253037884, + "grad_norm": 0.49229851365089417, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0524, + "step": 8170 + }, + { + "epoch": 0.23388134381701214, + "grad_norm": 0.8386240601539612, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0527, + "step": 8180 + }, + { + "epoch": 0.23416726233023588, + "grad_norm": 0.7806615829467773, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0529, + "step": 8190 + }, + { + "epoch": 0.23445318084345962, + "grad_norm": 0.5716270804405212, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0534, + "step": 8200 + }, + { + "epoch": 0.23473909935668336, + "grad_norm": 1.165761947631836, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0591, + "step": 8210 + }, + { + "epoch": 0.23502501786990707, + "grad_norm": 0.867967426776886, + "learning_rate": 1.738529690353544e-05, + "loss": 0.049, + "step": 8220 + }, + { + "epoch": 0.2353109363831308, + "grad_norm": 0.5809492468833923, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0434, + "step": 8230 + }, + { + "epoch": 0.23559685489635454, + "grad_norm": 0.8418740034103394, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0461, + "step": 8240 + }, + { + "epoch": 0.23588277340957828, + "grad_norm": 0.5811617374420166, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0443, + "step": 8250 + }, + { + "epoch": 0.236168691922802, + "grad_norm": 0.7699318528175354, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0549, + "step": 8260 + }, + { + "epoch": 0.23645461043602573, + "grad_norm": 0.6066992878913879, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0415, + "step": 8270 + }, + { + "epoch": 0.23674052894924946, + "grad_norm": 0.7775973677635193, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0619, + "step": 8280 + }, + { + "epoch": 0.2370264474624732, + "grad_norm": 0.8320962190628052, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.048, + "step": 8290 + }, + { + "epoch": 0.23731236597569694, + "grad_norm": 0.7203818559646606, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0594, + "step": 8300 + }, + { + "epoch": 0.23759828448892065, + "grad_norm": 0.7634598612785339, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0614, + "step": 8310 + }, + { + "epoch": 0.23788420300214438, + "grad_norm": 0.557575523853302, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 0.23817012151536812, + "grad_norm": 1.0139968395233154, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0457, + "step": 8330 + }, + { + "epoch": 0.23845604002859186, + "grad_norm": 0.5543113946914673, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.048, + "step": 8340 + }, + { + "epoch": 0.23874195854181557, + "grad_norm": 1.0122590065002441, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0509, + "step": 8350 + }, + { + "epoch": 0.2390278770550393, + "grad_norm": 0.8776134252548218, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0475, + "step": 8360 + }, + { + "epoch": 0.23931379556826304, + "grad_norm": 0.41230106353759766, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0467, + "step": 8370 + }, + { + "epoch": 0.23959971408148678, + "grad_norm": 0.5460986495018005, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0455, + "step": 8380 + }, + { + "epoch": 0.23988563259471052, + "grad_norm": 0.5896333456039429, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.051, + "step": 8390 + }, + { + "epoch": 0.24017155110793423, + "grad_norm": 0.536375105381012, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0432, + "step": 8400 + }, + { + "epoch": 0.24045746962115797, + "grad_norm": 0.7597050666809082, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0459, + "step": 8410 + }, + { + "epoch": 0.2407433881343817, + "grad_norm": 0.6669795513153076, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0584, + "step": 8420 + }, + { + "epoch": 0.24102930664760544, + "grad_norm": 0.3614502251148224, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.039, + "step": 8430 + }, + { + "epoch": 0.24131522516082915, + "grad_norm": 0.5618023872375488, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0394, + "step": 8440 + }, + { + "epoch": 0.2416011436740529, + "grad_norm": 0.5897185802459717, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0502, + "step": 8450 + }, + { + "epoch": 0.24188706218727662, + "grad_norm": 0.5622876882553101, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0382, + "step": 8460 + }, + { + "epoch": 0.24217298070050036, + "grad_norm": 0.5639696717262268, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0652, + "step": 8470 + }, + { + "epoch": 0.2424588992137241, + "grad_norm": 0.5686836242675781, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0609, + "step": 8480 + }, + { + "epoch": 0.2427448177269478, + "grad_norm": 0.7248222827911377, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0512, + "step": 8490 + }, + { + "epoch": 0.24303073624017155, + "grad_norm": 0.6157225370407104, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0449, + "step": 8500 + }, + { + "epoch": 0.24331665475339528, + "grad_norm": 1.1660966873168945, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0427, + "step": 8510 + }, + { + "epoch": 0.24360257326661902, + "grad_norm": 1.1242589950561523, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0514, + "step": 8520 + }, + { + "epoch": 0.24388849177984273, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0491, + "step": 8530 + }, + { + "epoch": 0.24417441029306647, + "grad_norm": 0.41474589705467224, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0427, + "step": 8540 + }, + { + "epoch": 0.2444603288062902, + "grad_norm": 0.42195969820022583, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0486, + "step": 8550 + }, + { + "epoch": 0.24474624731951394, + "grad_norm": 0.3914433717727661, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0411, + "step": 8560 + }, + { + "epoch": 0.24503216583273768, + "grad_norm": 0.7590876817703247, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0439, + "step": 8570 + }, + { + "epoch": 0.2453180843459614, + "grad_norm": 0.4362296164035797, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0466, + "step": 8580 + }, + { + "epoch": 0.24560400285918513, + "grad_norm": 0.467949241399765, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0502, + "step": 8590 + }, + { + "epoch": 0.24588992137240887, + "grad_norm": 0.4731729328632355, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0599, + "step": 8600 + }, + { + "epoch": 0.2461758398856326, + "grad_norm": 0.491644948720932, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0524, + "step": 8610 + }, + { + "epoch": 0.2464617583988563, + "grad_norm": 0.5254928469657898, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0469, + "step": 8620 + }, + { + "epoch": 0.24674767691208005, + "grad_norm": 0.5721238255500793, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0493, + "step": 8630 + }, + { + "epoch": 0.2470335954253038, + "grad_norm": 0.5806096792221069, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0391, + "step": 8640 + }, + { + "epoch": 0.24731951393852752, + "grad_norm": 0.6683222055435181, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.24760543245175126, + "grad_norm": 0.41728726029396057, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0411, + "step": 8660 + }, + { + "epoch": 0.24789135096497497, + "grad_norm": 0.6001113653182983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0413, + "step": 8670 + }, + { + "epoch": 0.2481772694781987, + "grad_norm": 0.43813610076904297, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0389, + "step": 8680 + }, + { + "epoch": 0.24846318799142245, + "grad_norm": 1.5533791780471802, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0597, + "step": 8690 + }, + { + "epoch": 0.24874910650464618, + "grad_norm": 1.175837755203247, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 0.2490350250178699, + "grad_norm": 0.4798300862312317, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0459, + "step": 8710 + }, + { + "epoch": 0.24932094353109363, + "grad_norm": 0.7334772944450378, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0432, + "step": 8720 + }, + { + "epoch": 0.24960686204431737, + "grad_norm": 0.9633310437202454, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.05, + "step": 8730 + }, + { + "epoch": 0.2498927805575411, + "grad_norm": 0.7353480458259583, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.25017869907076484, + "grad_norm": 0.5958748459815979, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0428, + "step": 8750 + }, + { + "epoch": 0.2504646175839886, + "grad_norm": 0.8538689613342285, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0498, + "step": 8760 + }, + { + "epoch": 0.2507505360972123, + "grad_norm": 0.606607973575592, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.251036454610436, + "grad_norm": 0.3999035060405731, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0714, + "step": 8780 + }, + { + "epoch": 0.25132237312365974, + "grad_norm": 0.807314932346344, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.057, + "step": 8790 + }, + { + "epoch": 0.2516082916368835, + "grad_norm": 0.5238217115402222, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0485, + "step": 8800 + }, + { + "epoch": 0.2518942101501072, + "grad_norm": 1.6465950012207031, + "learning_rate": 1.696714953556411e-05, + "loss": 0.056, + "step": 8810 + }, + { + "epoch": 0.25218012866333095, + "grad_norm": 0.6568214297294617, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0424, + "step": 8820 + }, + { + "epoch": 0.2524660471765547, + "grad_norm": 0.4695168137550354, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0456, + "step": 8830 + }, + { + "epoch": 0.2527519656897784, + "grad_norm": 0.5652263164520264, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0527, + "step": 8840 + }, + { + "epoch": 0.25303788420300216, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0441, + "step": 8850 + }, + { + "epoch": 0.2533238027162259, + "grad_norm": 0.8288971781730652, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0513, + "step": 8860 + }, + { + "epoch": 0.2536097212294496, + "grad_norm": 0.8606051802635193, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0416, + "step": 8870 + }, + { + "epoch": 0.2538956397426733, + "grad_norm": 0.7235842347145081, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0481, + "step": 8880 + }, + { + "epoch": 0.25418155825589706, + "grad_norm": 0.9602673053741455, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0465, + "step": 8890 + }, + { + "epoch": 0.2544674767691208, + "grad_norm": 0.6431217789649963, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0489, + "step": 8900 + }, + { + "epoch": 0.25475339528234453, + "grad_norm": 0.42215701937675476, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0376, + "step": 8910 + }, + { + "epoch": 0.25503931379556827, + "grad_norm": 0.5899976491928101, + "learning_rate": 1.688644181174108e-05, + "loss": 0.048, + "step": 8920 + }, + { + "epoch": 0.255325232308792, + "grad_norm": 0.9504411816596985, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.047, + "step": 8930 + }, + { + "epoch": 0.25561115082201574, + "grad_norm": 0.5808438062667847, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0535, + "step": 8940 + }, + { + "epoch": 0.2558970693352395, + "grad_norm": 0.3811270594596863, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0418, + "step": 8950 + }, + { + "epoch": 0.25618298784846316, + "grad_norm": 1.0257363319396973, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0548, + "step": 8960 + }, + { + "epoch": 0.2564689063616869, + "grad_norm": 0.7294469475746155, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0569, + "step": 8970 + }, + { + "epoch": 0.25675482487491064, + "grad_norm": 0.4967000484466553, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0488, + "step": 8980 + }, + { + "epoch": 0.2570407433881344, + "grad_norm": 0.9160422086715698, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0471, + "step": 8990 + }, + { + "epoch": 0.2573266619013581, + "grad_norm": 0.5125435590744019, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.25761258041458185, + "grad_norm": 0.5617201328277588, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0597, + "step": 9010 + }, + { + "epoch": 0.2578984989278056, + "grad_norm": 0.7771851420402527, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0485, + "step": 9020 + }, + { + "epoch": 0.2581844174410293, + "grad_norm": 0.8434289693832397, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0429, + "step": 9030 + }, + { + "epoch": 0.25847033595425306, + "grad_norm": 0.513541042804718, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0488, + "step": 9040 + }, + { + "epoch": 0.25875625446747674, + "grad_norm": 1.0142096281051636, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0517, + "step": 9050 + }, + { + "epoch": 0.2590421729807005, + "grad_norm": 0.6343669295310974, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.049, + "step": 9060 + }, + { + "epoch": 0.2593280914939242, + "grad_norm": 0.33996936678886414, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.037, + "step": 9070 + }, + { + "epoch": 0.25961401000714796, + "grad_norm": 0.5964446663856506, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 0.2598999285203717, + "grad_norm": 0.4989728629589081, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0463, + "step": 9090 + }, + { + "epoch": 0.26018584703359543, + "grad_norm": 0.7735986113548279, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0576, + "step": 9100 + }, + { + "epoch": 0.26047176554681917, + "grad_norm": 1.2520418167114258, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0577, + "step": 9110 + }, + { + "epoch": 0.2607576840600429, + "grad_norm": 0.45247936248779297, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0458, + "step": 9120 + }, + { + "epoch": 0.26104360257326664, + "grad_norm": 0.8944823145866394, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0498, + "step": 9130 + }, + { + "epoch": 0.2613295210864903, + "grad_norm": 0.8308315277099609, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0545, + "step": 9140 + }, + { + "epoch": 0.26161543959971406, + "grad_norm": 0.6838778853416443, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 0.2619013581129378, + "grad_norm": 1.5998408794403076, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0591, + "step": 9160 + }, + { + "epoch": 0.26218727662616154, + "grad_norm": 0.8548596501350403, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.04, + "step": 9170 + }, + { + "epoch": 0.2624731951393853, + "grad_norm": 0.5784913897514343, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0464, + "step": 9180 + }, + { + "epoch": 0.262759113652609, + "grad_norm": 1.490502953529358, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0672, + "step": 9190 + }, + { + "epoch": 0.26304503216583275, + "grad_norm": 0.8950793743133545, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0532, + "step": 9200 + }, + { + "epoch": 0.2633309506790565, + "grad_norm": 0.5513611435890198, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 0.2636168691922802, + "grad_norm": 1.0512864589691162, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0487, + "step": 9220 + }, + { + "epoch": 0.2639027877055039, + "grad_norm": 0.48180028796195984, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0543, + "step": 9230 + }, + { + "epoch": 0.26418870621872764, + "grad_norm": 0.5451590418815613, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0553, + "step": 9240 + }, + { + "epoch": 0.2644746247319514, + "grad_norm": 0.6986148953437805, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0523, + "step": 9250 + }, + { + "epoch": 0.2647605432451751, + "grad_norm": 0.5977929830551147, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0446, + "step": 9260 + }, + { + "epoch": 0.26504646175839885, + "grad_norm": 0.6042361855506897, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0716, + "step": 9270 + }, + { + "epoch": 0.2653323802716226, + "grad_norm": 0.473418265581131, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0378, + "step": 9280 + }, + { + "epoch": 0.26561829878484633, + "grad_norm": 0.9332809448242188, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0484, + "step": 9290 + }, + { + "epoch": 0.26590421729807007, + "grad_norm": 0.5209246277809143, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0439, + "step": 9300 + }, + { + "epoch": 0.2661901358112938, + "grad_norm": 0.5742560625076294, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0468, + "step": 9310 + }, + { + "epoch": 0.2664760543245175, + "grad_norm": 0.585503876209259, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0507, + "step": 9320 + }, + { + "epoch": 0.2667619728377412, + "grad_norm": 0.5254957675933838, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0436, + "step": 9330 + }, + { + "epoch": 0.26704789135096496, + "grad_norm": 0.48314452171325684, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0417, + "step": 9340 + }, + { + "epoch": 0.2673338098641887, + "grad_norm": 0.630020022392273, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0425, + "step": 9350 + }, + { + "epoch": 0.26761972837741244, + "grad_norm": 0.3545299470424652, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0338, + "step": 9360 + }, + { + "epoch": 0.2679056468906362, + "grad_norm": 0.6934211850166321, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0445, + "step": 9370 + }, + { + "epoch": 0.2681915654038599, + "grad_norm": 0.6544952392578125, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0451, + "step": 9380 + }, + { + "epoch": 0.26847748391708365, + "grad_norm": 0.4581946134567261, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0422, + "step": 9390 + }, + { + "epoch": 0.2687634024303074, + "grad_norm": 0.6338506937026978, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0576, + "step": 9400 + }, + { + "epoch": 0.26904932094353107, + "grad_norm": 0.8165014386177063, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0474, + "step": 9410 + }, + { + "epoch": 0.2693352394567548, + "grad_norm": 0.793222188949585, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0546, + "step": 9420 + }, + { + "epoch": 0.26962115796997854, + "grad_norm": 0.3669852316379547, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0461, + "step": 9430 + }, + { + "epoch": 0.2699070764832023, + "grad_norm": 0.7339810729026794, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0433, + "step": 9440 + }, + { + "epoch": 0.270192994996426, + "grad_norm": 0.4948982298374176, + "learning_rate": 1.648606940465527e-05, + "loss": 0.048, + "step": 9450 + }, + { + "epoch": 0.27047891350964975, + "grad_norm": 0.4681016206741333, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0437, + "step": 9460 + }, + { + "epoch": 0.2707648320228735, + "grad_norm": 0.5091472864151001, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0576, + "step": 9470 + }, + { + "epoch": 0.27105075053609723, + "grad_norm": 0.5683515071868896, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0503, + "step": 9480 + }, + { + "epoch": 0.27133666904932097, + "grad_norm": 0.626844048500061, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0495, + "step": 9490 + }, + { + "epoch": 0.27162258756254465, + "grad_norm": 0.6757943034172058, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0495, + "step": 9500 + }, + { + "epoch": 0.2719085060757684, + "grad_norm": 0.7049196362495422, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0579, + "step": 9510 + }, + { + "epoch": 0.2721944245889921, + "grad_norm": 0.6469181776046753, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.051, + "step": 9520 + }, + { + "epoch": 0.27248034310221586, + "grad_norm": 0.5414942502975464, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0433, + "step": 9530 + }, + { + "epoch": 0.2727662616154396, + "grad_norm": 0.5642798542976379, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0495, + "step": 9540 + }, + { + "epoch": 0.27305218012866334, + "grad_norm": 1.0527595281600952, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0445, + "step": 9550 + }, + { + "epoch": 0.2733380986418871, + "grad_norm": 0.8501784801483154, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0627, + "step": 9560 + }, + { + "epoch": 0.2736240171551108, + "grad_norm": 0.7892033457756042, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 0.27390993566833455, + "grad_norm": 0.3588624596595764, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0512, + "step": 9580 + }, + { + "epoch": 0.27419585418155823, + "grad_norm": 0.7474772930145264, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.6217718124389648, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0435, + "step": 9600 + }, + { + "epoch": 0.2747676912080057, + "grad_norm": 0.7711623907089233, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.054, + "step": 9610 + }, + { + "epoch": 0.27505360972122944, + "grad_norm": 0.8171371221542358, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0371, + "step": 9620 + }, + { + "epoch": 0.2753395282344532, + "grad_norm": 0.8668338060379028, + "learning_rate": 1.634591312387623e-05, + "loss": 0.055, + "step": 9630 + }, + { + "epoch": 0.2756254467476769, + "grad_norm": 0.5683940052986145, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0478, + "step": 9640 + }, + { + "epoch": 0.27591136526090065, + "grad_norm": 0.44098007678985596, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0531, + "step": 9650 + }, + { + "epoch": 0.2761972837741244, + "grad_norm": 0.8305087685585022, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0462, + "step": 9660 + }, + { + "epoch": 0.27648320228734813, + "grad_norm": 0.9088799953460693, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0489, + "step": 9670 + }, + { + "epoch": 0.2767691208005718, + "grad_norm": 0.5590132474899292, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0486, + "step": 9680 + }, + { + "epoch": 0.27705503931379555, + "grad_norm": 0.776713490486145, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0443, + "step": 9690 + }, + { + "epoch": 0.2773409578270193, + "grad_norm": 0.6107578873634338, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0461, + "step": 9700 + }, + { + "epoch": 0.277626876340243, + "grad_norm": 0.4635901153087616, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0397, + "step": 9710 + }, + { + "epoch": 0.27791279485346676, + "grad_norm": 0.4220955967903137, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0463, + "step": 9720 + }, + { + "epoch": 0.2781987133666905, + "grad_norm": 0.4947739243507385, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0397, + "step": 9730 + }, + { + "epoch": 0.27848463187991424, + "grad_norm": 0.5589033961296082, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0426, + "step": 9740 + }, + { + "epoch": 0.278770550393138, + "grad_norm": 0.4904254972934723, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0458, + "step": 9750 + }, + { + "epoch": 0.2790564689063617, + "grad_norm": 0.34956127405166626, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.2793423874195854, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0424, + "step": 9770 + }, + { + "epoch": 0.27962830593280913, + "grad_norm": 0.48727869987487793, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0451, + "step": 9780 + }, + { + "epoch": 0.27991422444603287, + "grad_norm": 0.7314761281013489, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0523, + "step": 9790 + }, + { + "epoch": 0.2802001429592566, + "grad_norm": 0.5017405152320862, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0423, + "step": 9800 + }, + { + "epoch": 0.28048606147248034, + "grad_norm": 0.8375383615493774, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0435, + "step": 9810 + }, + { + "epoch": 0.2807719799857041, + "grad_norm": 0.8702818155288696, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0487, + "step": 9820 + }, + { + "epoch": 0.2810578984989278, + "grad_norm": 0.4649866223335266, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0483, + "step": 9830 + }, + { + "epoch": 0.28134381701215155, + "grad_norm": 0.7464607357978821, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0463, + "step": 9840 + }, + { + "epoch": 0.2816297355253753, + "grad_norm": 0.48055607080459595, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0418, + "step": 9850 + }, + { + "epoch": 0.281915654038599, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0433, + "step": 9860 + }, + { + "epoch": 0.2822015725518227, + "grad_norm": 0.8859265446662903, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0605, + "step": 9870 + }, + { + "epoch": 0.28248749106504645, + "grad_norm": 0.8236640691757202, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0441, + "step": 9880 + }, + { + "epoch": 0.2827734095782702, + "grad_norm": 0.6617199778556824, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0515, + "step": 9890 + }, + { + "epoch": 0.2830593280914939, + "grad_norm": 0.8017821907997131, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0503, + "step": 9900 + }, + { + "epoch": 0.28334524660471766, + "grad_norm": 1.070827603340149, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0485, + "step": 9910 + }, + { + "epoch": 0.2836311651179414, + "grad_norm": 1.021888256072998, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0479, + "step": 9920 + }, + { + "epoch": 0.28391708363116513, + "grad_norm": 0.34402501583099365, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0391, + "step": 9930 + }, + { + "epoch": 0.28420300214438887, + "grad_norm": 0.58541339635849, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0461, + "step": 9940 + }, + { + "epoch": 0.28448892065761255, + "grad_norm": 0.8062207102775574, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0553, + "step": 9950 + }, + { + "epoch": 0.2847748391708363, + "grad_norm": 0.6435661315917969, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0536, + "step": 9960 + }, + { + "epoch": 0.28506075768406003, + "grad_norm": 0.5670832395553589, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0405, + "step": 9970 + }, + { + "epoch": 0.28534667619728377, + "grad_norm": 0.45282548666000366, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0458, + "step": 9980 + }, + { + "epoch": 0.2856325947105075, + "grad_norm": 0.42272916436195374, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0392, + "step": 9990 + }, + { + "epoch": 0.28591851322373124, + "grad_norm": 0.5791928768157959, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0453, + "step": 10000 + }, + { + "epoch": 0.286204431736955, + "grad_norm": 0.9841408729553223, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.052, + "step": 10010 + }, + { + "epoch": 0.2864903502501787, + "grad_norm": 0.8658338785171509, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0461, + "step": 10020 + }, + { + "epoch": 0.28677626876340245, + "grad_norm": 0.624788224697113, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0416, + "step": 10030 + }, + { + "epoch": 0.28706218727662614, + "grad_norm": 0.6108028888702393, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0548, + "step": 10040 + }, + { + "epoch": 0.2873481057898499, + "grad_norm": 0.7907708883285522, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0406, + "step": 10050 + }, + { + "epoch": 0.2876340243030736, + "grad_norm": 0.7695413827896118, + "learning_rate": 1.60029690609047e-05, + "loss": 0.061, + "step": 10060 + }, + { + "epoch": 0.28791994281629735, + "grad_norm": 0.4407683312892914, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0483, + "step": 10070 + }, + { + "epoch": 0.2882058613295211, + "grad_norm": 0.6242743730545044, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.039, + "step": 10080 + }, + { + "epoch": 0.2884917798427448, + "grad_norm": 0.8752113580703735, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0433, + "step": 10090 + }, + { + "epoch": 0.28877769835596856, + "grad_norm": 0.8834511041641235, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0348, + "step": 10100 + }, + { + "epoch": 0.2890636168691923, + "grad_norm": 1.0036063194274902, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0593, + "step": 10110 + }, + { + "epoch": 0.28934953538241603, + "grad_norm": 0.5511205196380615, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0459, + "step": 10120 + }, + { + "epoch": 0.2896354538956397, + "grad_norm": 0.7717337012290955, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0381, + "step": 10130 + }, + { + "epoch": 0.28992137240886345, + "grad_norm": 1.123363971710205, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0462, + "step": 10140 + }, + { + "epoch": 0.2902072909220872, + "grad_norm": 0.6212007403373718, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0446, + "step": 10150 + }, + { + "epoch": 0.29049320943531093, + "grad_norm": 0.5547964572906494, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0362, + "step": 10160 + }, + { + "epoch": 0.29077912794853467, + "grad_norm": 0.593225359916687, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0462, + "step": 10170 + }, + { + "epoch": 0.2910650464617584, + "grad_norm": 0.5569560527801514, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0508, + "step": 10180 + }, + { + "epoch": 0.29135096497498214, + "grad_norm": 0.5464656949043274, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0399, + "step": 10190 + }, + { + "epoch": 0.2916368834882059, + "grad_norm": 1.2456778287887573, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0494, + "step": 10200 + }, + { + "epoch": 0.2919228020014296, + "grad_norm": 0.7862445712089539, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0551, + "step": 10210 + }, + { + "epoch": 0.2922087205146533, + "grad_norm": 0.745941698551178, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0469, + "step": 10220 + }, + { + "epoch": 0.29249463902787703, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0492, + "step": 10230 + }, + { + "epoch": 0.29278055754110077, + "grad_norm": 0.659205973148346, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0453, + "step": 10240 + }, + { + "epoch": 0.2930664760543245, + "grad_norm": 0.6925905346870422, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0463, + "step": 10250 + }, + { + "epoch": 0.29335239456754825, + "grad_norm": 0.479115754365921, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0395, + "step": 10260 + }, + { + "epoch": 0.293638313080772, + "grad_norm": 0.5085121393203735, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0504, + "step": 10270 + }, + { + "epoch": 0.2939242315939957, + "grad_norm": 0.46833914518356323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0411, + "step": 10280 + }, + { + "epoch": 0.29421015010721946, + "grad_norm": 0.4534672796726227, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0491, + "step": 10290 + }, + { + "epoch": 0.2944960686204432, + "grad_norm": 0.5704737305641174, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0391, + "step": 10300 + }, + { + "epoch": 0.2947819871336669, + "grad_norm": 1.0342676639556885, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0681, + "step": 10310 + }, + { + "epoch": 0.2950679056468906, + "grad_norm": 0.5002169013023376, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0429, + "step": 10320 + }, + { + "epoch": 0.29535382416011435, + "grad_norm": 0.5565863847732544, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0575, + "step": 10330 + }, + { + "epoch": 0.2956397426733381, + "grad_norm": 0.7826551198959351, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0448, + "step": 10340 + }, + { + "epoch": 0.29592566118656183, + "grad_norm": 0.7019012570381165, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0436, + "step": 10350 + }, + { + "epoch": 0.29621157969978557, + "grad_norm": 0.8324534893035889, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0503, + "step": 10360 + }, + { + "epoch": 0.2964974982130093, + "grad_norm": 0.7064073085784912, + "learning_rate": 1.574895332125391e-05, + "loss": 0.041, + "step": 10370 + }, + { + "epoch": 0.29678341672623304, + "grad_norm": 0.5634047389030457, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0474, + "step": 10380 + }, + { + "epoch": 0.2970693352394568, + "grad_norm": 0.8504926562309265, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0502, + "step": 10390 + }, + { + "epoch": 0.29735525375268046, + "grad_norm": 0.508313775062561, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0368, + "step": 10400 + }, + { + "epoch": 0.2976411722659042, + "grad_norm": 0.5851112008094788, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0472, + "step": 10410 + }, + { + "epoch": 0.29792709077912793, + "grad_norm": 0.5689557790756226, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0479, + "step": 10420 + }, + { + "epoch": 0.29821300929235167, + "grad_norm": 0.5026743412017822, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0406, + "step": 10430 + }, + { + "epoch": 0.2984989278055754, + "grad_norm": 0.5662751197814941, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0441, + "step": 10440 + }, + { + "epoch": 0.29878484631879915, + "grad_norm": 0.899709939956665, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.2990707648320229, + "grad_norm": 0.4681940972805023, + "learning_rate": 1.567419089313346e-05, + "loss": 0.054, + "step": 10460 + }, + { + "epoch": 0.2993566833452466, + "grad_norm": 0.39646071195602417, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0375, + "step": 10470 + }, + { + "epoch": 0.29964260185847036, + "grad_norm": 1.204815149307251, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0487, + "step": 10480 + }, + { + "epoch": 0.29992852037169404, + "grad_norm": 0.4507630467414856, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0516, + "step": 10490 + }, + { + "epoch": 0.3002144388849178, + "grad_norm": 0.9783321022987366, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0642, + "step": 10500 + }, + { + "epoch": 0.3005003573981415, + "grad_norm": 0.5406969785690308, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0447, + "step": 10510 + }, + { + "epoch": 0.30078627591136525, + "grad_norm": 0.44153860211372375, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0449, + "step": 10520 + }, + { + "epoch": 0.301072194424589, + "grad_norm": 0.5723687410354614, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0548, + "step": 10530 + }, + { + "epoch": 0.3013581129378127, + "grad_norm": 0.4453120529651642, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0434, + "step": 10540 + }, + { + "epoch": 0.30164403145103647, + "grad_norm": 0.34224697947502136, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0385, + "step": 10550 + }, + { + "epoch": 0.3019299499642602, + "grad_norm": 0.6389157176017761, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0569, + "step": 10560 + }, + { + "epoch": 0.30221586847748394, + "grad_norm": 0.5845953822135925, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0467, + "step": 10570 + }, + { + "epoch": 0.3025017869907076, + "grad_norm": 0.6581900119781494, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0422, + "step": 10580 + }, + { + "epoch": 0.30278770550393136, + "grad_norm": 0.4964161813259125, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0428, + "step": 10590 + }, + { + "epoch": 0.3030736240171551, + "grad_norm": 0.635380208492279, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0442, + "step": 10600 + }, + { + "epoch": 0.30335954253037883, + "grad_norm": 0.9795969128608704, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0517, + "step": 10610 + }, + { + "epoch": 0.30364546104360257, + "grad_norm": 0.9987231492996216, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0514, + "step": 10620 + }, + { + "epoch": 0.3039313795568263, + "grad_norm": 0.6384946703910828, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0471, + "step": 10630 + }, + { + "epoch": 0.30421729807005005, + "grad_norm": 0.49352115392684937, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0351, + "step": 10640 + }, + { + "epoch": 0.3045032165832738, + "grad_norm": 0.45028480887413025, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0438, + "step": 10650 + }, + { + "epoch": 0.3047891350964975, + "grad_norm": 0.5717794895172119, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0491, + "step": 10660 + }, + { + "epoch": 0.3050750536097212, + "grad_norm": 0.5436326265335083, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0407, + "step": 10670 + }, + { + "epoch": 0.30536097212294494, + "grad_norm": 0.7777692675590515, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0436, + "step": 10680 + }, + { + "epoch": 0.3056468906361687, + "grad_norm": 0.6597929000854492, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0407, + "step": 10690 + }, + { + "epoch": 0.3059328091493924, + "grad_norm": 0.6059311032295227, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0481, + "step": 10700 + }, + { + "epoch": 0.30621872766261615, + "grad_norm": 0.5530681014060974, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0418, + "step": 10710 + }, + { + "epoch": 0.3065046461758399, + "grad_norm": 0.5778716802597046, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0429, + "step": 10720 + }, + { + "epoch": 0.3067905646890636, + "grad_norm": 0.4573792517185211, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0586, + "step": 10730 + }, + { + "epoch": 0.30707648320228736, + "grad_norm": 0.8193615078926086, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0474, + "step": 10740 + }, + { + "epoch": 0.3073624017155111, + "grad_norm": 0.9410123229026794, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.3076483202287348, + "grad_norm": 0.8244432806968689, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0462, + "step": 10760 + }, + { + "epoch": 0.3079342387419585, + "grad_norm": 0.644899845123291, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0479, + "step": 10770 + }, + { + "epoch": 0.30822015725518226, + "grad_norm": 0.28044867515563965, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.04, + "step": 10780 + }, + { + "epoch": 0.308506075768406, + "grad_norm": 0.6538394093513489, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0406, + "step": 10790 + }, + { + "epoch": 0.30879199428162973, + "grad_norm": 0.9572822451591492, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0505, + "step": 10800 + }, + { + "epoch": 0.30907791279485347, + "grad_norm": 0.539826512336731, + "learning_rate": 1.537928999540189e-05, + "loss": 0.05, + "step": 10810 + }, + { + "epoch": 0.3093638313080772, + "grad_norm": 0.801988959312439, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0454, + "step": 10820 + }, + { + "epoch": 0.30964974982130095, + "grad_norm": 0.57478928565979, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.039, + "step": 10830 + }, + { + "epoch": 0.3099356683345247, + "grad_norm": 0.6313017010688782, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0384, + "step": 10840 + }, + { + "epoch": 0.31022158684774837, + "grad_norm": 0.507997989654541, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0365, + "step": 10850 + }, + { + "epoch": 0.3105075053609721, + "grad_norm": 0.5152313709259033, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0487, + "step": 10860 + }, + { + "epoch": 0.31079342387419584, + "grad_norm": 0.6123478412628174, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0405, + "step": 10870 + }, + { + "epoch": 0.3110793423874196, + "grad_norm": 1.079551100730896, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0443, + "step": 10880 + }, + { + "epoch": 0.3113652609006433, + "grad_norm": 0.39866960048675537, + "learning_rate": 1.531098472380285e-05, + "loss": 0.04, + "step": 10890 + }, + { + "epoch": 0.31165117941386705, + "grad_norm": 0.3715427815914154, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0387, + "step": 10900 + }, + { + "epoch": 0.3119370979270908, + "grad_norm": 0.7201068997383118, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.054, + "step": 10910 + }, + { + "epoch": 0.3122230164403145, + "grad_norm": 0.9512631893157959, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0383, + "step": 10920 + }, + { + "epoch": 0.31250893495353826, + "grad_norm": 0.5948206186294556, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0472, + "step": 10930 + }, + { + "epoch": 0.31279485346676195, + "grad_norm": 0.7174249291419983, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0437, + "step": 10940 + }, + { + "epoch": 0.3130807719799857, + "grad_norm": 0.6190982460975647, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0383, + "step": 10950 + }, + { + "epoch": 0.3133666904932094, + "grad_norm": 0.7733815312385559, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0327, + "step": 10960 + }, + { + "epoch": 0.31365260900643316, + "grad_norm": 1.2995271682739258, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0427, + "step": 10970 + }, + { + "epoch": 0.3139385275196569, + "grad_norm": 1.1102336645126343, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.04, + "step": 10980 + }, + { + "epoch": 0.31422444603288063, + "grad_norm": 0.7618277668952942, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0488, + "step": 10990 + }, + { + "epoch": 0.31451036454610437, + "grad_norm": 0.5355142951011658, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0436, + "step": 11000 + }, + { + "epoch": 0.3147962830593281, + "grad_norm": 1.3410072326660156, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0463, + "step": 11010 + }, + { + "epoch": 0.31508220157255185, + "grad_norm": 0.7810450196266174, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0493, + "step": 11020 + }, + { + "epoch": 0.3153681200857755, + "grad_norm": 0.6452206373214722, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0354, + "step": 11030 + }, + { + "epoch": 0.31565403859899926, + "grad_norm": 1.037593126296997, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0418, + "step": 11040 + }, + { + "epoch": 0.315939957112223, + "grad_norm": 0.7032834887504578, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0431, + "step": 11050 + }, + { + "epoch": 0.31622587562544674, + "grad_norm": 0.5168939232826233, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0472, + "step": 11060 + }, + { + "epoch": 0.3165117941386705, + "grad_norm": 0.5239925384521484, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0438, + "step": 11070 + }, + { + "epoch": 0.3167977126518942, + "grad_norm": 0.8209654688835144, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0506, + "step": 11080 + }, + { + "epoch": 0.31708363116511795, + "grad_norm": 0.5318232178688049, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0516, + "step": 11090 + }, + { + "epoch": 0.3173695496783417, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0482, + "step": 11100 + }, + { + "epoch": 0.3176554681915654, + "grad_norm": 0.6691215634346008, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.046, + "step": 11110 + }, + { + "epoch": 0.3179413867047891, + "grad_norm": 0.4862753450870514, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 0.31822730521801285, + "grad_norm": 0.4640316963195801, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0433, + "step": 11130 + }, + { + "epoch": 0.3185132237312366, + "grad_norm": 0.7841521501541138, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0445, + "step": 11140 + }, + { + "epoch": 0.3187991422444603, + "grad_norm": 0.6809426546096802, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0518, + "step": 11150 + }, + { + "epoch": 0.31908506075768406, + "grad_norm": 0.6195946931838989, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0569, + "step": 11160 + }, + { + "epoch": 0.3193709792709078, + "grad_norm": 0.7289860248565674, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0487, + "step": 11170 + }, + { + "epoch": 0.31965689778413153, + "grad_norm": 0.5575736165046692, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0409, + "step": 11180 + }, + { + "epoch": 0.31994281629735527, + "grad_norm": 0.8619267344474792, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0424, + "step": 11190 + }, + { + "epoch": 0.320228734810579, + "grad_norm": 0.740242063999176, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0474, + "step": 11200 + }, + { + "epoch": 0.3205146533238027, + "grad_norm": 0.4169894754886627, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0395, + "step": 11210 + }, + { + "epoch": 0.3208005718370264, + "grad_norm": 0.5773794651031494, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0414, + "step": 11220 + }, + { + "epoch": 0.32108649035025016, + "grad_norm": 0.4941500723361969, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0484, + "step": 11230 + }, + { + "epoch": 0.3213724088634739, + "grad_norm": 0.7985579371452332, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.051, + "step": 11240 + }, + { + "epoch": 0.32165832737669764, + "grad_norm": 0.5262066721916199, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0434, + "step": 11250 + }, + { + "epoch": 0.3219442458899214, + "grad_norm": 0.4074312150478363, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0428, + "step": 11260 + }, + { + "epoch": 0.3222301644031451, + "grad_norm": 1.0757715702056885, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0468, + "step": 11270 + }, + { + "epoch": 0.32251608291636885, + "grad_norm": 0.7281575202941895, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0386, + "step": 11280 + }, + { + "epoch": 0.3228020014295926, + "grad_norm": 0.35078516602516174, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0413, + "step": 11290 + }, + { + "epoch": 0.32308791994281627, + "grad_norm": 0.5642452836036682, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0435, + "step": 11300 + }, + { + "epoch": 0.32337383845604, + "grad_norm": 0.5326974987983704, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0459, + "step": 11310 + }, + { + "epoch": 0.32365975696926375, + "grad_norm": 0.6212049126625061, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0451, + "step": 11320 + }, + { + "epoch": 0.3239456754824875, + "grad_norm": 0.4887222349643707, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0445, + "step": 11330 + }, + { + "epoch": 0.3242315939957112, + "grad_norm": 0.6692403554916382, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0423, + "step": 11340 + }, + { + "epoch": 0.32451751250893496, + "grad_norm": 0.7166061997413635, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0445, + "step": 11350 + }, + { + "epoch": 0.3248034310221587, + "grad_norm": 0.5342463850975037, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0394, + "step": 11360 + }, + { + "epoch": 0.32508934953538243, + "grad_norm": 1.0617904663085938, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0401, + "step": 11370 + }, + { + "epoch": 0.32537526804860617, + "grad_norm": 0.9869458675384521, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0508, + "step": 11380 + }, + { + "epoch": 0.32566118656182985, + "grad_norm": 0.32021698355674744, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0346, + "step": 11390 + }, + { + "epoch": 0.3259471050750536, + "grad_norm": 0.6566154360771179, + "learning_rate": 1.486814531655139e-05, + "loss": 0.046, + "step": 11400 + }, + { + "epoch": 0.3262330235882773, + "grad_norm": 0.6716777086257935, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.045, + "step": 11410 + }, + { + "epoch": 0.32651894210150106, + "grad_norm": 0.7489042282104492, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0443, + "step": 11420 + }, + { + "epoch": 0.3268048606147248, + "grad_norm": 0.6040313243865967, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0418, + "step": 11430 + }, + { + "epoch": 0.32709077912794854, + "grad_norm": 0.4891999363899231, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0342, + "step": 11440 + }, + { + "epoch": 0.3273766976411723, + "grad_norm": 0.4264339506626129, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0414, + "step": 11450 + }, + { + "epoch": 0.327662616154396, + "grad_norm": 0.5535606741905212, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0362, + "step": 11460 + }, + { + "epoch": 0.32794853466761975, + "grad_norm": 0.566705048084259, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0472, + "step": 11470 + }, + { + "epoch": 0.32823445318084343, + "grad_norm": 0.8539089560508728, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0478, + "step": 11480 + }, + { + "epoch": 0.32852037169406717, + "grad_norm": 0.3981179893016815, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0429, + "step": 11490 + }, + { + "epoch": 0.3288062902072909, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0487, + "step": 11500 + }, + { + "epoch": 0.32909220872051465, + "grad_norm": 0.45551198720932007, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0384, + "step": 11510 + }, + { + "epoch": 0.3293781272337384, + "grad_norm": 0.6321517825126648, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0541, + "step": 11520 + }, + { + "epoch": 0.3296640457469621, + "grad_norm": 0.7971932888031006, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0445, + "step": 11530 + }, + { + "epoch": 0.32994996426018586, + "grad_norm": 0.5022657513618469, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0414, + "step": 11540 + }, + { + "epoch": 0.3302358827734096, + "grad_norm": 0.7302954196929932, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.044, + "step": 11550 + }, + { + "epoch": 0.33052180128663333, + "grad_norm": 0.5123834013938904, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0451, + "step": 11560 + }, + { + "epoch": 0.330807719799857, + "grad_norm": 0.5261625647544861, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0416, + "step": 11570 + }, + { + "epoch": 0.33109363831308075, + "grad_norm": 0.5782840251922607, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0419, + "step": 11580 + }, + { + "epoch": 0.3313795568263045, + "grad_norm": 0.9754800796508789, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0403, + "step": 11590 + }, + { + "epoch": 0.3316654753395282, + "grad_norm": 0.48157551884651184, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0459, + "step": 11600 + }, + { + "epoch": 0.33195139385275196, + "grad_norm": 0.4394964277744293, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0461, + "step": 11610 + }, + { + "epoch": 0.3322373123659757, + "grad_norm": 1.220790147781372, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0448, + "step": 11620 + }, + { + "epoch": 0.33252323087919944, + "grad_norm": 0.6908231973648071, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0431, + "step": 11630 + }, + { + "epoch": 0.3328091493924232, + "grad_norm": 0.45382779836654663, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0379, + "step": 11640 + }, + { + "epoch": 0.3330950679056469, + "grad_norm": 0.5963619947433472, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0465, + "step": 11650 + }, + { + "epoch": 0.3333809864188706, + "grad_norm": 0.676210880279541, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0411, + "step": 11660 + }, + { + "epoch": 0.33366690493209433, + "grad_norm": 0.893473744392395, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0443, + "step": 11670 + }, + { + "epoch": 0.33395282344531807, + "grad_norm": 0.30655553936958313, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.04, + "step": 11680 + }, + { + "epoch": 0.3342387419585418, + "grad_norm": 0.899615466594696, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0462, + "step": 11690 + }, + { + "epoch": 0.33452466047176554, + "grad_norm": 0.5037568807601929, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0394, + "step": 11700 + }, + { + "epoch": 0.3348105789849893, + "grad_norm": 0.573716402053833, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0426, + "step": 11710 + }, + { + "epoch": 0.335096497498213, + "grad_norm": 0.4985221326351166, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0422, + "step": 11720 + }, + { + "epoch": 0.33538241601143676, + "grad_norm": 0.8864797353744507, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0504, + "step": 11730 + }, + { + "epoch": 0.3356683345246605, + "grad_norm": 0.49209004640579224, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0409, + "step": 11740 + }, + { + "epoch": 0.3359542530378842, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0468, + "step": 11750 + }, + { + "epoch": 0.3362401715511079, + "grad_norm": 0.7552497386932373, + "learning_rate": 1.454836451908656e-05, + "loss": 0.041, + "step": 11760 + }, + { + "epoch": 0.33652609006433165, + "grad_norm": 0.5737242102622986, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0503, + "step": 11770 + }, + { + "epoch": 0.3368120085775554, + "grad_norm": 0.46150341629981995, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0399, + "step": 11780 + }, + { + "epoch": 0.3370979270907791, + "grad_norm": 0.55389803647995, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0442, + "step": 11790 + }, + { + "epoch": 0.33738384560400286, + "grad_norm": 0.7647727727890015, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0472, + "step": 11800 + }, + { + "epoch": 0.3376697641172266, + "grad_norm": 0.8755397200584412, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0444, + "step": 11810 + }, + { + "epoch": 0.33795568263045034, + "grad_norm": 0.9257917404174805, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0416, + "step": 11820 + }, + { + "epoch": 0.3382416011436741, + "grad_norm": 0.4048840403556824, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0418, + "step": 11830 + }, + { + "epoch": 0.33852751965689776, + "grad_norm": 0.584200382232666, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0436, + "step": 11840 + }, + { + "epoch": 0.3388134381701215, + "grad_norm": 0.7565616369247437, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0407, + "step": 11850 + }, + { + "epoch": 0.33909935668334523, + "grad_norm": 0.8025793433189392, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0424, + "step": 11860 + }, + { + "epoch": 0.33938527519656897, + "grad_norm": 0.3123756945133209, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.044, + "step": 11870 + }, + { + "epoch": 0.3396711937097927, + "grad_norm": 0.8047941327095032, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0471, + "step": 11880 + }, + { + "epoch": 0.33995711222301644, + "grad_norm": 0.8675779104232788, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0443, + "step": 11890 + }, + { + "epoch": 0.3402430307362402, + "grad_norm": 0.47229406237602234, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0416, + "step": 11900 + }, + { + "epoch": 0.3405289492494639, + "grad_norm": 0.3775595426559448, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0512, + "step": 11910 + }, + { + "epoch": 0.34081486776268766, + "grad_norm": 0.6179372668266296, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0395, + "step": 11920 + }, + { + "epoch": 0.34110078627591134, + "grad_norm": 0.47618359327316284, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0407, + "step": 11930 + }, + { + "epoch": 0.3413867047891351, + "grad_norm": 0.5495609641075134, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.041, + "step": 11940 + }, + { + "epoch": 0.3416726233023588, + "grad_norm": 0.7276089191436768, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0445, + "step": 11950 + }, + { + "epoch": 0.34195854181558255, + "grad_norm": 0.9464111328125, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0471, + "step": 11960 + }, + { + "epoch": 0.3422444603288063, + "grad_norm": 0.8340250253677368, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0488, + "step": 11970 + }, + { + "epoch": 0.34253037884203, + "grad_norm": 0.6392719149589539, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0407, + "step": 11980 + }, + { + "epoch": 0.34281629735525376, + "grad_norm": 0.7563493251800537, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0388, + "step": 11990 + }, + { + "epoch": 0.3431022158684775, + "grad_norm": 0.7145271301269531, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.042, + "step": 12000 + }, + { + "epoch": 0.34338813438170124, + "grad_norm": 0.6522033214569092, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0507, + "step": 12010 + }, + { + "epoch": 0.3436740528949249, + "grad_norm": 0.4634755849838257, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0388, + "step": 12020 + }, + { + "epoch": 0.34395997140814866, + "grad_norm": 0.6681762337684631, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0498, + "step": 12030 + }, + { + "epoch": 0.3442458899213724, + "grad_norm": 0.5068351626396179, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0484, + "step": 12040 + }, + { + "epoch": 0.34453180843459613, + "grad_norm": 0.5424943566322327, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0406, + "step": 12050 + }, + { + "epoch": 0.34481772694781987, + "grad_norm": 0.674436628818512, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.04, + "step": 12060 + }, + { + "epoch": 0.3451036454610436, + "grad_norm": 0.8140727281570435, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0417, + "step": 12070 + }, + { + "epoch": 0.34538956397426734, + "grad_norm": 0.6394575238227844, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0413, + "step": 12080 + }, + { + "epoch": 0.3456754824874911, + "grad_norm": 0.5134334564208984, + "learning_rate": 1.425047976058418e-05, + "loss": 0.04, + "step": 12090 + }, + { + "epoch": 0.3459614010007148, + "grad_norm": 0.6670883297920227, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0397, + "step": 12100 + }, + { + "epoch": 0.3462473195139385, + "grad_norm": 0.49804338812828064, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0431, + "step": 12110 + }, + { + "epoch": 0.34653323802716224, + "grad_norm": 0.33912673592567444, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0492, + "step": 12120 + }, + { + "epoch": 0.346819156540386, + "grad_norm": 0.45478618144989014, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0427, + "step": 12130 + }, + { + "epoch": 0.3471050750536097, + "grad_norm": 0.6690845489501953, + "learning_rate": 1.420497389129506e-05, + "loss": 0.044, + "step": 12140 + }, + { + "epoch": 0.34739099356683345, + "grad_norm": 0.9296556115150452, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.042, + "step": 12150 + }, + { + "epoch": 0.3476769120800572, + "grad_norm": 0.4859760105609894, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0386, + "step": 12160 + }, + { + "epoch": 0.3479628305932809, + "grad_norm": 1.0067541599273682, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0495, + "step": 12170 + }, + { + "epoch": 0.34824874910650466, + "grad_norm": 0.7799471616744995, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0614, + "step": 12180 + }, + { + "epoch": 0.3485346676197284, + "grad_norm": 0.48603832721710205, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0422, + "step": 12190 + }, + { + "epoch": 0.3488205861329521, + "grad_norm": 1.2030225992202759, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0535, + "step": 12200 + }, + { + "epoch": 0.3491065046461758, + "grad_norm": 0.5523782968521118, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0437, + "step": 12210 + }, + { + "epoch": 0.34939242315939956, + "grad_norm": 0.9041968584060669, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0441, + "step": 12220 + }, + { + "epoch": 0.3496783416726233, + "grad_norm": 0.5859020948410034, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0451, + "step": 12230 + }, + { + "epoch": 0.34996426018584703, + "grad_norm": 0.8736525177955627, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0439, + "step": 12240 + }, + { + "epoch": 0.35025017869907077, + "grad_norm": 0.4692678153514862, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0516, + "step": 12250 + }, + { + "epoch": 0.3505360972122945, + "grad_norm": 0.6326560974121094, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0427, + "step": 12260 + }, + { + "epoch": 0.35082201572551824, + "grad_norm": 0.6265914440155029, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0392, + "step": 12270 + }, + { + "epoch": 0.351107934238742, + "grad_norm": 0.8684681057929993, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0416, + "step": 12280 + }, + { + "epoch": 0.35139385275196566, + "grad_norm": 0.6076116561889648, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0405, + "step": 12290 + }, + { + "epoch": 0.3516797712651894, + "grad_norm": 0.36192813515663147, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0417, + "step": 12300 + }, + { + "epoch": 0.35196568977841314, + "grad_norm": 0.5561486482620239, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0397, + "step": 12310 + }, + { + "epoch": 0.3522516082916369, + "grad_norm": 0.5955346822738647, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0332, + "step": 12320 + }, + { + "epoch": 0.3525375268048606, + "grad_norm": 0.4861294627189636, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0423, + "step": 12330 + }, + { + "epoch": 0.35282344531808435, + "grad_norm": 0.920704185962677, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0467, + "step": 12340 + }, + { + "epoch": 0.3531093638313081, + "grad_norm": 0.4749159514904022, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0425, + "step": 12350 + }, + { + "epoch": 0.3533952823445318, + "grad_norm": 0.5075432658195496, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0362, + "step": 12360 + }, + { + "epoch": 0.35368120085775556, + "grad_norm": 0.3057022988796234, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0378, + "step": 12370 + }, + { + "epoch": 0.35396711937097924, + "grad_norm": 0.48122167587280273, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0359, + "step": 12380 + }, + { + "epoch": 0.354253037884203, + "grad_norm": 0.39227673411369324, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0432, + "step": 12390 + }, + { + "epoch": 0.3545389563974267, + "grad_norm": 0.641839861869812, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0422, + "step": 12400 + }, + { + "epoch": 0.35482487491065046, + "grad_norm": 1.0422887802124023, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0445, + "step": 12410 + }, + { + "epoch": 0.3551107934238742, + "grad_norm": 0.5336428880691528, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0408, + "step": 12420 + }, + { + "epoch": 0.35539671193709793, + "grad_norm": 0.6634368896484375, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0374, + "step": 12430 + }, + { + "epoch": 0.35568263045032167, + "grad_norm": 0.5840758085250854, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0417, + "step": 12440 + }, + { + "epoch": 0.3559685489635454, + "grad_norm": 0.8465530872344971, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0449, + "step": 12450 + }, + { + "epoch": 0.35625446747676914, + "grad_norm": 0.48737838864326477, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0439, + "step": 12460 + }, + { + "epoch": 0.3565403859899928, + "grad_norm": 1.2267687320709229, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0395, + "step": 12470 + }, + { + "epoch": 0.35682630450321656, + "grad_norm": 0.4097842276096344, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0379, + "step": 12480 + }, + { + "epoch": 0.3571122230164403, + "grad_norm": 0.8895343542098999, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0415, + "step": 12490 + }, + { + "epoch": 0.35739814152966404, + "grad_norm": 0.6732933521270752, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0432, + "step": 12500 + }, + { + "epoch": 0.3576840600428878, + "grad_norm": 0.4521937966346741, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0442, + "step": 12510 + }, + { + "epoch": 0.3579699785561115, + "grad_norm": 0.5932701826095581, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0407, + "step": 12520 + }, + { + "epoch": 0.35825589706933525, + "grad_norm": 0.5595138669013977, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0387, + "step": 12530 + }, + { + "epoch": 0.358541815582559, + "grad_norm": 0.7205538153648376, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0393, + "step": 12540 + }, + { + "epoch": 0.3588277340957827, + "grad_norm": 0.4069580137729645, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0554, + "step": 12550 + }, + { + "epoch": 0.3591136526090064, + "grad_norm": 0.4881740212440491, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0411, + "step": 12560 + }, + { + "epoch": 0.35939957112223014, + "grad_norm": 0.7710328102111816, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.043, + "step": 12570 + }, + { + "epoch": 0.3596854896354539, + "grad_norm": 0.6593908071517944, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.046, + "step": 12580 + }, + { + "epoch": 0.3599714081486776, + "grad_norm": 0.6712149977684021, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0392, + "step": 12590 + }, + { + "epoch": 0.36025732666190136, + "grad_norm": 0.6103658080101013, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0482, + "step": 12600 + }, + { + "epoch": 0.3605432451751251, + "grad_norm": 0.5170528292655945, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0441, + "step": 12610 + }, + { + "epoch": 0.36082916368834883, + "grad_norm": 0.47434374690055847, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0436, + "step": 12620 + }, + { + "epoch": 0.36111508220157257, + "grad_norm": 0.6546452045440674, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0441, + "step": 12630 + }, + { + "epoch": 0.3614010007147963, + "grad_norm": 1.3334686756134033, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0464, + "step": 12640 + }, + { + "epoch": 0.36168691922802, + "grad_norm": 1.3882309198379517, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0527, + "step": 12650 + }, + { + "epoch": 0.3619728377412437, + "grad_norm": 0.829872190952301, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0499, + "step": 12660 + }, + { + "epoch": 0.36225875625446746, + "grad_norm": 0.6917227506637573, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0513, + "step": 12670 + }, + { + "epoch": 0.3625446747676912, + "grad_norm": 0.3825722634792328, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0455, + "step": 12680 + }, + { + "epoch": 0.36283059328091494, + "grad_norm": 0.7726976275444031, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0366, + "step": 12690 + }, + { + "epoch": 0.3631165117941387, + "grad_norm": 0.48851099610328674, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0363, + "step": 12700 + }, + { + "epoch": 0.3634024303073624, + "grad_norm": 0.5034362077713013, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0461, + "step": 12710 + }, + { + "epoch": 0.36368834882058615, + "grad_norm": 0.8411096334457397, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0448, + "step": 12720 + }, + { + "epoch": 0.3639742673338099, + "grad_norm": 0.7185337543487549, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0366, + "step": 12730 + }, + { + "epoch": 0.36426018584703357, + "grad_norm": 0.5850857496261597, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0414, + "step": 12740 + }, + { + "epoch": 0.3645461043602573, + "grad_norm": 0.47304606437683105, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0464, + "step": 12750 + }, + { + "epoch": 0.36483202287348104, + "grad_norm": 0.7190109491348267, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0418, + "step": 12760 + }, + { + "epoch": 0.3651179413867048, + "grad_norm": 0.8053406476974487, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0407, + "step": 12770 + }, + { + "epoch": 0.3654038598999285, + "grad_norm": 0.8875076174736023, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0471, + "step": 12780 + }, + { + "epoch": 0.36568977841315226, + "grad_norm": 0.5206999182701111, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0478, + "step": 12790 + }, + { + "epoch": 0.365975696926376, + "grad_norm": 0.5034269690513611, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0422, + "step": 12800 + }, + { + "epoch": 0.36626161543959973, + "grad_norm": 0.9846853017807007, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.045, + "step": 12810 + }, + { + "epoch": 0.36654753395282347, + "grad_norm": 0.49341151118278503, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0471, + "step": 12820 + }, + { + "epoch": 0.36683345246604715, + "grad_norm": 0.765583336353302, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0411, + "step": 12830 + }, + { + "epoch": 0.3671193709792709, + "grad_norm": 0.5193378925323486, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0522, + "step": 12840 + }, + { + "epoch": 0.3674052894924946, + "grad_norm": 0.8142374157905579, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0374, + "step": 12850 + }, + { + "epoch": 0.36769120800571836, + "grad_norm": 0.7233540415763855, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0516, + "step": 12860 + }, + { + "epoch": 0.3679771265189421, + "grad_norm": 0.38758793473243713, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0437, + "step": 12870 + }, + { + "epoch": 0.36826304503216584, + "grad_norm": 0.36923956871032715, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.041, + "step": 12880 + }, + { + "epoch": 0.3685489635453896, + "grad_norm": 1.0518147945404053, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0446, + "step": 12890 + }, + { + "epoch": 0.3688348820586133, + "grad_norm": 0.5833591818809509, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0362, + "step": 12900 + }, + { + "epoch": 0.36912080057183705, + "grad_norm": 0.6178849339485168, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.041, + "step": 12910 + }, + { + "epoch": 0.36940671908506073, + "grad_norm": 0.7599044442176819, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0473, + "step": 12920 + }, + { + "epoch": 0.36969263759828447, + "grad_norm": 0.7787651419639587, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0461, + "step": 12930 + }, + { + "epoch": 0.3699785561115082, + "grad_norm": 0.3847586512565613, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0413, + "step": 12940 + }, + { + "epoch": 0.37026447462473194, + "grad_norm": 0.6218805313110352, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0424, + "step": 12950 + }, + { + "epoch": 0.3705503931379557, + "grad_norm": 0.6770363450050354, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0426, + "step": 12960 + }, + { + "epoch": 0.3708363116511794, + "grad_norm": 0.6817107796669006, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.041, + "step": 12970 + }, + { + "epoch": 0.37112223016440316, + "grad_norm": 1.6997944116592407, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0626, + "step": 12980 + }, + { + "epoch": 0.3714081486776269, + "grad_norm": 0.4540708363056183, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0356, + "step": 12990 + }, + { + "epoch": 0.37169406719085063, + "grad_norm": 0.4272336959838867, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0354, + "step": 13000 + }, + { + "epoch": 0.3719799857040743, + "grad_norm": 0.4723891019821167, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0438, + "step": 13010 + }, + { + "epoch": 0.37226590421729805, + "grad_norm": 0.5508099794387817, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.042, + "step": 13020 + }, + { + "epoch": 0.3725518227305218, + "grad_norm": 1.05836021900177, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0472, + "step": 13030 + }, + { + "epoch": 0.3728377412437455, + "grad_norm": 0.4397801458835602, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0462, + "step": 13040 + }, + { + "epoch": 0.37312365975696926, + "grad_norm": 0.3131158649921417, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0383, + "step": 13050 + }, + { + "epoch": 0.373409578270193, + "grad_norm": 0.5489990711212158, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0398, + "step": 13060 + }, + { + "epoch": 0.37369549678341674, + "grad_norm": 0.7425751686096191, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0416, + "step": 13070 + }, + { + "epoch": 0.3739814152966405, + "grad_norm": 0.6337125301361084, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0387, + "step": 13080 + }, + { + "epoch": 0.3742673338098642, + "grad_norm": 0.656467854976654, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0431, + "step": 13090 + }, + { + "epoch": 0.3745532523230879, + "grad_norm": 0.7011964321136475, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0487, + "step": 13100 + }, + { + "epoch": 0.37483917083631163, + "grad_norm": 0.4949609041213989, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0429, + "step": 13110 + }, + { + "epoch": 0.37512508934953537, + "grad_norm": 0.6796516180038452, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0405, + "step": 13120 + }, + { + "epoch": 0.3754110078627591, + "grad_norm": 0.41161492466926575, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0359, + "step": 13130 + }, + { + "epoch": 0.37569692637598284, + "grad_norm": 0.4463254511356354, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0353, + "step": 13140 + }, + { + "epoch": 0.3759828448892066, + "grad_norm": 0.4082377254962921, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.047, + "step": 13150 + }, + { + "epoch": 0.3762687634024303, + "grad_norm": 0.7927104830741882, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0484, + "step": 13160 + }, + { + "epoch": 0.37655468191565405, + "grad_norm": 0.5212385058403015, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.041, + "step": 13170 + }, + { + "epoch": 0.3768406004288778, + "grad_norm": 0.7408128380775452, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0462, + "step": 13180 + }, + { + "epoch": 0.3771265189421015, + "grad_norm": 0.3847906291484833, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0361, + "step": 13190 + }, + { + "epoch": 0.3774124374553252, + "grad_norm": 0.5039756298065186, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0385, + "step": 13200 + }, + { + "epoch": 0.37769835596854895, + "grad_norm": 0.5682945251464844, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0369, + "step": 13210 + }, + { + "epoch": 0.3779842744817727, + "grad_norm": 0.5985261797904968, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0376, + "step": 13220 + }, + { + "epoch": 0.3782701929949964, + "grad_norm": 0.7080312967300415, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0514, + "step": 13230 + }, + { + "epoch": 0.37855611150822016, + "grad_norm": 0.7488406300544739, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0421, + "step": 13240 + }, + { + "epoch": 0.3788420300214439, + "grad_norm": 0.38066044449806213, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0411, + "step": 13250 + }, + { + "epoch": 0.37912794853466764, + "grad_norm": 0.6335283517837524, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0526, + "step": 13260 + }, + { + "epoch": 0.3794138670478914, + "grad_norm": 0.7008160352706909, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0402, + "step": 13270 + }, + { + "epoch": 0.37969978556111506, + "grad_norm": 0.4219777286052704, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.039, + "step": 13280 + }, + { + "epoch": 0.3799857040743388, + "grad_norm": 0.6447705030441284, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0412, + "step": 13290 + }, + { + "epoch": 0.38027162258756253, + "grad_norm": 0.4625374674797058, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0437, + "step": 13300 + }, + { + "epoch": 0.38055754110078627, + "grad_norm": 0.4056257903575897, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0377, + "step": 13310 + }, + { + "epoch": 0.38084345961401, + "grad_norm": 0.425281286239624, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0378, + "step": 13320 + }, + { + "epoch": 0.38112937812723374, + "grad_norm": 0.4031837582588196, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0361, + "step": 13330 + }, + { + "epoch": 0.3814152966404575, + "grad_norm": 0.469175785779953, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0391, + "step": 13340 + }, + { + "epoch": 0.3817012151536812, + "grad_norm": 0.36555227637290955, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0352, + "step": 13350 + }, + { + "epoch": 0.38198713366690495, + "grad_norm": 0.8802763819694519, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0412, + "step": 13360 + }, + { + "epoch": 0.38227305218012864, + "grad_norm": 0.5733079314231873, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0418, + "step": 13370 + }, + { + "epoch": 0.3825589706933524, + "grad_norm": 0.606238842010498, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0518, + "step": 13380 + }, + { + "epoch": 0.3828448892065761, + "grad_norm": 0.5096673369407654, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0404, + "step": 13390 + }, + { + "epoch": 0.38313080771979985, + "grad_norm": 0.8240867853164673, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0513, + "step": 13400 + }, + { + "epoch": 0.3834167262330236, + "grad_norm": 0.3757685422897339, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0407, + "step": 13410 + }, + { + "epoch": 0.3837026447462473, + "grad_norm": 0.4560941755771637, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0429, + "step": 13420 + }, + { + "epoch": 0.38398856325947106, + "grad_norm": 0.42831951379776, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0387, + "step": 13430 + }, + { + "epoch": 0.3842744817726948, + "grad_norm": 0.8373785614967346, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0473, + "step": 13440 + }, + { + "epoch": 0.38456040028591854, + "grad_norm": 0.9560670256614685, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0442, + "step": 13450 + }, + { + "epoch": 0.3848463187991422, + "grad_norm": 0.4101570248603821, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0429, + "step": 13460 + }, + { + "epoch": 0.38513223731236595, + "grad_norm": 0.673739492893219, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0525, + "step": 13470 + }, + { + "epoch": 0.3854181558255897, + "grad_norm": 1.126909852027893, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0499, + "step": 13480 + }, + { + "epoch": 0.38570407433881343, + "grad_norm": 0.571437656879425, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0431, + "step": 13490 + }, + { + "epoch": 0.38598999285203717, + "grad_norm": 0.5121229887008667, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0419, + "step": 13500 + }, + { + "epoch": 0.3862759113652609, + "grad_norm": 0.6143786907196045, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0373, + "step": 13510 + }, + { + "epoch": 0.38656182987848464, + "grad_norm": 0.395014226436615, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0457, + "step": 13520 + }, + { + "epoch": 0.3868477483917084, + "grad_norm": 0.46027693152427673, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0372, + "step": 13530 + }, + { + "epoch": 0.3871336669049321, + "grad_norm": 0.42744559049606323, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0417, + "step": 13540 + }, + { + "epoch": 0.3874195854181558, + "grad_norm": 0.4765837490558624, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0442, + "step": 13550 + }, + { + "epoch": 0.38770550393137954, + "grad_norm": 0.9767054319381714, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0397, + "step": 13560 + }, + { + "epoch": 0.3879914224446033, + "grad_norm": 0.5535935759544373, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0423, + "step": 13570 + }, + { + "epoch": 0.388277340957827, + "grad_norm": 0.3802829384803772, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0329, + "step": 13580 + }, + { + "epoch": 0.38856325947105075, + "grad_norm": 0.6564178466796875, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0423, + "step": 13590 + }, + { + "epoch": 0.3888491779842745, + "grad_norm": 0.4400223195552826, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0356, + "step": 13600 + }, + { + "epoch": 0.3891350964974982, + "grad_norm": 0.4441612958908081, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0576, + "step": 13610 + }, + { + "epoch": 0.38942101501072196, + "grad_norm": 0.5270922780036926, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0406, + "step": 13620 + }, + { + "epoch": 0.3897069335239457, + "grad_norm": 0.6497722268104553, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0389, + "step": 13630 + }, + { + "epoch": 0.3899928520371694, + "grad_norm": 0.628182053565979, + "learning_rate": 1.280216624157504e-05, + "loss": 0.049, + "step": 13640 + }, + { + "epoch": 0.3902787705503931, + "grad_norm": 0.5242640376091003, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0389, + "step": 13650 + }, + { + "epoch": 0.39056468906361685, + "grad_norm": 0.5140895843505859, + "learning_rate": 1.278305741539386e-05, + "loss": 0.047, + "step": 13660 + }, + { + "epoch": 0.3908506075768406, + "grad_norm": 0.531012773513794, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0415, + "step": 13670 + }, + { + "epoch": 0.39113652609006433, + "grad_norm": 0.5066007375717163, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0411, + "step": 13680 + }, + { + "epoch": 0.39142244460328807, + "grad_norm": 1.0783177614212036, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0371, + "step": 13690 + }, + { + "epoch": 0.3917083631165118, + "grad_norm": 0.592755913734436, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0402, + "step": 13700 + }, + { + "epoch": 0.39199428162973554, + "grad_norm": 0.5595790147781372, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0543, + "step": 13710 + }, + { + "epoch": 0.3922802001429593, + "grad_norm": 0.5388237237930298, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0487, + "step": 13720 + }, + { + "epoch": 0.39256611865618296, + "grad_norm": 0.5311065316200256, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0389, + "step": 13730 + }, + { + "epoch": 0.3928520371694067, + "grad_norm": 0.8037494421005249, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0456, + "step": 13740 + }, + { + "epoch": 0.39313795568263044, + "grad_norm": 0.851921796798706, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0389, + "step": 13750 + }, + { + "epoch": 0.3934238741958542, + "grad_norm": 0.5924596190452576, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0401, + "step": 13760 + }, + { + "epoch": 0.3937097927090779, + "grad_norm": 0.5660725831985474, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0443, + "step": 13770 + }, + { + "epoch": 0.39399571122230165, + "grad_norm": 0.4110502004623413, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0438, + "step": 13780 + }, + { + "epoch": 0.3942816297355254, + "grad_norm": 0.7104408144950867, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.042, + "step": 13790 + }, + { + "epoch": 0.3945675482487491, + "grad_norm": 0.5490137338638306, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0477, + "step": 13800 + }, + { + "epoch": 0.39485346676197286, + "grad_norm": 0.4189203083515167, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0446, + "step": 13810 + }, + { + "epoch": 0.39513938527519654, + "grad_norm": 3.620929479598999, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0541, + "step": 13820 + }, + { + "epoch": 0.3954253037884203, + "grad_norm": 0.4670915901660919, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0391, + "step": 13830 + }, + { + "epoch": 0.395711222301644, + "grad_norm": 0.4475649297237396, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.04, + "step": 13840 + }, + { + "epoch": 0.39599714081486775, + "grad_norm": 0.4646693170070648, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0412, + "step": 13850 + }, + { + "epoch": 0.3962830593280915, + "grad_norm": 0.4141371250152588, + "learning_rate": 1.259152361972498e-05, + "loss": 0.039, + "step": 13860 + }, + { + "epoch": 0.39656897784131523, + "grad_norm": 0.7549411058425903, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0414, + "step": 13870 + }, + { + "epoch": 0.39685489635453897, + "grad_norm": 0.5687856078147888, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0441, + "step": 13880 + }, + { + "epoch": 0.3971408148677627, + "grad_norm": 0.582946240901947, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0451, + "step": 13890 + }, + { + "epoch": 0.39742673338098644, + "grad_norm": 0.6410595178604126, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0362, + "step": 13900 + }, + { + "epoch": 0.3977126518942101, + "grad_norm": 0.4375670850276947, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0552, + "step": 13910 + }, + { + "epoch": 0.39799857040743386, + "grad_norm": 0.5675646662712097, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0373, + "step": 13920 + }, + { + "epoch": 0.3982844889206576, + "grad_norm": 0.544170618057251, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0449, + "step": 13930 + }, + { + "epoch": 0.39857040743388134, + "grad_norm": 0.44928276538848877, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0461, + "step": 13940 + }, + { + "epoch": 0.3988563259471051, + "grad_norm": 0.511382520198822, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0413, + "step": 13950 + }, + { + "epoch": 0.3991422444603288, + "grad_norm": 0.38443753123283386, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0374, + "step": 13960 + }, + { + "epoch": 0.39942816297355255, + "grad_norm": 0.5726080536842346, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0553, + "step": 13970 + }, + { + "epoch": 0.3997140814867763, + "grad_norm": 0.554694414138794, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0404, + "step": 13980 + }, + { + "epoch": 0.4, + "grad_norm": 0.4891316592693329, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0418, + "step": 13990 + }, + { + "epoch": 0.4002859185132237, + "grad_norm": 0.5150312781333923, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0418, + "step": 14000 + }, + { + "epoch": 0.40057183702644744, + "grad_norm": 0.9077253937721252, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0415, + "step": 14010 + }, + { + "epoch": 0.4008577555396712, + "grad_norm": 0.9126781225204468, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.048, + "step": 14020 + }, + { + "epoch": 0.4011436740528949, + "grad_norm": 0.6264623999595642, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0411, + "step": 14030 + }, + { + "epoch": 0.40142959256611865, + "grad_norm": 0.523853600025177, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.051, + "step": 14040 + }, + { + "epoch": 0.4017155110793424, + "grad_norm": 0.6340035200119019, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0426, + "step": 14050 + }, + { + "epoch": 0.40200142959256613, + "grad_norm": 0.3594725430011749, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0397, + "step": 14060 + }, + { + "epoch": 0.40228734810578987, + "grad_norm": 0.941470742225647, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0402, + "step": 14070 + }, + { + "epoch": 0.4025732666190136, + "grad_norm": 0.840506911277771, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0473, + "step": 14080 + }, + { + "epoch": 0.4028591851322373, + "grad_norm": 0.3359200954437256, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0405, + "step": 14090 + }, + { + "epoch": 0.403145103645461, + "grad_norm": 0.49658629298210144, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0464, + "step": 14100 + }, + { + "epoch": 0.40343102215868476, + "grad_norm": 0.7940187454223633, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0417, + "step": 14110 + }, + { + "epoch": 0.4037169406719085, + "grad_norm": 0.30110660195350647, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0371, + "step": 14120 + }, + { + "epoch": 0.40400285918513223, + "grad_norm": 0.42845240235328674, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.053, + "step": 14130 + }, + { + "epoch": 0.40428877769835597, + "grad_norm": 0.997348427772522, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.041, + "step": 14140 + }, + { + "epoch": 0.4045746962115797, + "grad_norm": 0.4759966731071472, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0377, + "step": 14150 + }, + { + "epoch": 0.40486061472480345, + "grad_norm": 0.42045602202415466, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0397, + "step": 14160 + }, + { + "epoch": 0.4051465332380272, + "grad_norm": 0.6400002837181091, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0507, + "step": 14170 + }, + { + "epoch": 0.40543245175125087, + "grad_norm": 0.5473673939704895, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0359, + "step": 14180 + }, + { + "epoch": 0.4057183702644746, + "grad_norm": 0.7414730787277222, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0416, + "step": 14190 + }, + { + "epoch": 0.40600428877769834, + "grad_norm": 0.4691861867904663, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0363, + "step": 14200 + }, + { + "epoch": 0.4062902072909221, + "grad_norm": 0.9186112880706787, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0445, + "step": 14210 + }, + { + "epoch": 0.4065761258041458, + "grad_norm": 0.6782190203666687, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.40686204431736955, + "grad_norm": 0.6948013305664062, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.037, + "step": 14230 + }, + { + "epoch": 0.4071479628305933, + "grad_norm": 0.3034680485725403, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0371, + "step": 14240 + }, + { + "epoch": 0.40743388134381703, + "grad_norm": 0.4254174828529358, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0449, + "step": 14250 + }, + { + "epoch": 0.40771979985704077, + "grad_norm": 1.3622064590454102, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0428, + "step": 14260 + }, + { + "epoch": 0.40800571837026445, + "grad_norm": 0.5928359031677246, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0424, + "step": 14270 + }, + { + "epoch": 0.4082916368834882, + "grad_norm": 0.9103132486343384, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0414, + "step": 14280 + }, + { + "epoch": 0.4085775553967119, + "grad_norm": 0.6338028311729431, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0376, + "step": 14290 + }, + { + "epoch": 0.40886347390993566, + "grad_norm": 0.9920284748077393, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0393, + "step": 14300 + }, + { + "epoch": 0.4091493924231594, + "grad_norm": 0.411830335855484, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0336, + "step": 14310 + }, + { + "epoch": 0.40943531093638313, + "grad_norm": 0.6977682709693909, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0454, + "step": 14320 + }, + { + "epoch": 0.40972122944960687, + "grad_norm": 0.6303663849830627, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0453, + "step": 14330 + }, + { + "epoch": 0.4100071479628306, + "grad_norm": 0.3048207759857178, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0373, + "step": 14340 + }, + { + "epoch": 0.41029306647605435, + "grad_norm": 0.7683395743370056, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0438, + "step": 14350 + }, + { + "epoch": 0.41057898498927803, + "grad_norm": 0.5791511535644531, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0392, + "step": 14360 + }, + { + "epoch": 0.41086490350250177, + "grad_norm": 0.876626193523407, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0324, + "step": 14370 + }, + { + "epoch": 0.4111508220157255, + "grad_norm": 0.5971815586090088, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0368, + "step": 14380 + }, + { + "epoch": 0.41143674052894924, + "grad_norm": 0.6508862376213074, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0411, + "step": 14390 + }, + { + "epoch": 0.411722659042173, + "grad_norm": 0.4704359471797943, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0351, + "step": 14400 + }, + { + "epoch": 0.4120085775553967, + "grad_norm": 0.4266453683376312, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0367, + "step": 14410 + }, + { + "epoch": 0.41229449606862045, + "grad_norm": 0.5898434519767761, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0376, + "step": 14420 + }, + { + "epoch": 0.4125804145818442, + "grad_norm": 0.8741532564163208, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0419, + "step": 14430 + }, + { + "epoch": 0.41286633309506793, + "grad_norm": 0.24328190088272095, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0333, + "step": 14440 + }, + { + "epoch": 0.4131522516082916, + "grad_norm": 0.4263601303100586, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.039, + "step": 14450 + }, + { + "epoch": 0.41343817012151535, + "grad_norm": 0.6311615109443665, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0454, + "step": 14460 + }, + { + "epoch": 0.4137240886347391, + "grad_norm": 0.7424519658088684, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0392, + "step": 14470 + }, + { + "epoch": 0.4140100071479628, + "grad_norm": 0.48323145508766174, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0374, + "step": 14480 + }, + { + "epoch": 0.41429592566118656, + "grad_norm": 0.38597407937049866, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0393, + "step": 14490 + }, + { + "epoch": 0.4145818441744103, + "grad_norm": 0.7251518964767456, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0431, + "step": 14500 + }, + { + "epoch": 0.41486776268763403, + "grad_norm": 0.44361060857772827, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0426, + "step": 14510 + }, + { + "epoch": 0.41515368120085777, + "grad_norm": 0.5625014305114746, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0372, + "step": 14520 + }, + { + "epoch": 0.4154395997140815, + "grad_norm": 0.27855798602104187, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0356, + "step": 14530 + }, + { + "epoch": 0.4157255182273052, + "grad_norm": 0.5966296195983887, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0387, + "step": 14540 + }, + { + "epoch": 0.41601143674052893, + "grad_norm": 0.49445512890815735, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0355, + "step": 14550 + }, + { + "epoch": 0.41629735525375267, + "grad_norm": 0.3813278377056122, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0456, + "step": 14560 + }, + { + "epoch": 0.4165832737669764, + "grad_norm": 0.5962988138198853, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0401, + "step": 14570 + }, + { + "epoch": 0.41686919228020014, + "grad_norm": 0.4028547406196594, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0371, + "step": 14580 + }, + { + "epoch": 0.4171551107934239, + "grad_norm": 1.348706841468811, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0426, + "step": 14590 + }, + { + "epoch": 0.4174410293066476, + "grad_norm": 1.2782070636749268, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0393, + "step": 14600 + }, + { + "epoch": 0.41772694781987135, + "grad_norm": 1.0024999380111694, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0436, + "step": 14610 + }, + { + "epoch": 0.4180128663330951, + "grad_norm": 0.35450127720832825, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0411, + "step": 14620 + }, + { + "epoch": 0.41829878484631877, + "grad_norm": 0.5827250480651855, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0372, + "step": 14630 + }, + { + "epoch": 0.4185847033595425, + "grad_norm": 0.5905774235725403, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0394, + "step": 14640 + }, + { + "epoch": 0.41887062187276625, + "grad_norm": 0.652074933052063, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0405, + "step": 14650 + }, + { + "epoch": 0.41915654038599, + "grad_norm": 0.7245490550994873, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0473, + "step": 14660 + }, + { + "epoch": 0.4194424588992137, + "grad_norm": 0.5153012871742249, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.043, + "step": 14670 + }, + { + "epoch": 0.41972837741243746, + "grad_norm": 0.516107976436615, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0434, + "step": 14680 + }, + { + "epoch": 0.4200142959256612, + "grad_norm": 0.4743354618549347, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0429, + "step": 14690 + }, + { + "epoch": 0.42030021443888493, + "grad_norm": 0.547875165939331, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0395, + "step": 14700 + }, + { + "epoch": 0.42058613295210867, + "grad_norm": 0.6398400068283081, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0384, + "step": 14710 + }, + { + "epoch": 0.42087205146533235, + "grad_norm": 0.5891467332839966, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0399, + "step": 14720 + }, + { + "epoch": 0.4211579699785561, + "grad_norm": 0.3927595615386963, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0353, + "step": 14730 + }, + { + "epoch": 0.42144388849177983, + "grad_norm": 0.6477030515670776, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0492, + "step": 14740 + }, + { + "epoch": 0.42172980700500357, + "grad_norm": 0.7090615034103394, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.042, + "step": 14750 + }, + { + "epoch": 0.4220157255182273, + "grad_norm": 0.6572134494781494, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0406, + "step": 14760 + }, + { + "epoch": 0.42230164403145104, + "grad_norm": 0.787663996219635, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0424, + "step": 14770 + }, + { + "epoch": 0.4225875625446748, + "grad_norm": 0.8419309258460999, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0427, + "step": 14780 + }, + { + "epoch": 0.4228734810578985, + "grad_norm": 0.6204128861427307, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0364, + "step": 14790 + }, + { + "epoch": 0.42315939957112225, + "grad_norm": 0.7446070313453674, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0391, + "step": 14800 + }, + { + "epoch": 0.42344531808434593, + "grad_norm": 0.7446451783180237, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0384, + "step": 14810 + }, + { + "epoch": 0.42373123659756967, + "grad_norm": 0.6946475505828857, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0375, + "step": 14820 + }, + { + "epoch": 0.4240171551107934, + "grad_norm": 0.6997008323669434, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0393, + "step": 14830 + }, + { + "epoch": 0.42430307362401715, + "grad_norm": 0.4857316315174103, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0474, + "step": 14840 + }, + { + "epoch": 0.4245889921372409, + "grad_norm": 1.3516888618469238, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.047, + "step": 14850 + }, + { + "epoch": 0.4248749106504646, + "grad_norm": 0.40320220589637756, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0418, + "step": 14860 + }, + { + "epoch": 0.42516082916368836, + "grad_norm": 0.9002796411514282, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0434, + "step": 14870 + }, + { + "epoch": 0.4254467476769121, + "grad_norm": 0.3810071349143982, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0338, + "step": 14880 + }, + { + "epoch": 0.42573266619013583, + "grad_norm": 0.5786157250404358, + "learning_rate": 1.159527607963768e-05, + "loss": 0.037, + "step": 14890 + }, + { + "epoch": 0.4260185847033595, + "grad_norm": 0.6316869258880615, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0388, + "step": 14900 + }, + { + "epoch": 0.42630450321658325, + "grad_norm": 0.608745276927948, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0426, + "step": 14910 + }, + { + "epoch": 0.426590421729807, + "grad_norm": 0.6655036807060242, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0433, + "step": 14920 + }, + { + "epoch": 0.4268763402430307, + "grad_norm": 0.29059523344039917, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0507, + "step": 14930 + }, + { + "epoch": 0.42716225875625446, + "grad_norm": 0.9066076278686523, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0447, + "step": 14940 + }, + { + "epoch": 0.4274481772694782, + "grad_norm": 1.0660220384597778, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0512, + "step": 14950 + }, + { + "epoch": 0.42773409578270194, + "grad_norm": 0.6081144213676453, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0426, + "step": 14960 + }, + { + "epoch": 0.4280200142959257, + "grad_norm": 0.46524369716644287, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0435, + "step": 14970 + }, + { + "epoch": 0.4283059328091494, + "grad_norm": 0.3497388958930969, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0492, + "step": 14980 + }, + { + "epoch": 0.4285918513223731, + "grad_norm": 0.41300803422927856, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.034, + "step": 14990 + }, + { + "epoch": 0.42887776983559683, + "grad_norm": 0.4363289177417755, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0358, + "step": 15000 + }, + { + "epoch": 0.42916368834882057, + "grad_norm": 1.314915418624878, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.047, + "step": 15010 + }, + { + "epoch": 0.4294496068620443, + "grad_norm": 0.558199942111969, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0313, + "step": 15020 + }, + { + "epoch": 0.42973552537526805, + "grad_norm": 0.3857463598251343, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0416, + "step": 15030 + }, + { + "epoch": 0.4300214438884918, + "grad_norm": 0.4701749384403229, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0425, + "step": 15040 + }, + { + "epoch": 0.4303073624017155, + "grad_norm": 0.4611213803291321, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0457, + "step": 15050 + }, + { + "epoch": 0.43059328091493926, + "grad_norm": 0.5338016152381897, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.038, + "step": 15060 + }, + { + "epoch": 0.430879199428163, + "grad_norm": 0.9078943133354187, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0395, + "step": 15070 + }, + { + "epoch": 0.4311651179413867, + "grad_norm": 0.5354048013687134, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0403, + "step": 15080 + }, + { + "epoch": 0.4314510364546104, + "grad_norm": 0.35511279106140137, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0377, + "step": 15090 + }, + { + "epoch": 0.43173695496783415, + "grad_norm": 0.37104350328445435, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0426, + "step": 15100 + }, + { + "epoch": 0.4320228734810579, + "grad_norm": 0.8916210532188416, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0387, + "step": 15110 + }, + { + "epoch": 0.4323087919942816, + "grad_norm": 0.514994740486145, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0384, + "step": 15120 + }, + { + "epoch": 0.43259471050750536, + "grad_norm": 0.8440690040588379, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0437, + "step": 15130 + }, + { + "epoch": 0.4328806290207291, + "grad_norm": 0.6815949082374573, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0453, + "step": 15140 + }, + { + "epoch": 0.43316654753395284, + "grad_norm": 0.33178189396858215, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0351, + "step": 15150 + }, + { + "epoch": 0.4334524660471766, + "grad_norm": 0.5686727166175842, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0368, + "step": 15160 + }, + { + "epoch": 0.43373838456040026, + "grad_norm": 0.44143930077552795, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0443, + "step": 15170 + }, + { + "epoch": 0.434024303073624, + "grad_norm": 0.3238232135772705, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0348, + "step": 15180 + }, + { + "epoch": 0.43431022158684773, + "grad_norm": 0.5038242340087891, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0343, + "step": 15190 + }, + { + "epoch": 0.43459614010007147, + "grad_norm": 0.4904351234436035, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0397, + "step": 15200 + }, + { + "epoch": 0.4348820586132952, + "grad_norm": 0.5325750708580017, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0499, + "step": 15210 + }, + { + "epoch": 0.43516797712651895, + "grad_norm": 0.39443954825401306, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.044, + "step": 15220 + }, + { + "epoch": 0.4354538956397427, + "grad_norm": 0.6782003045082092, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0358, + "step": 15230 + }, + { + "epoch": 0.4357398141529664, + "grad_norm": 0.47862571477890015, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0418, + "step": 15240 + }, + { + "epoch": 0.43602573266619016, + "grad_norm": 1.6515535116195679, + "learning_rate": 1.124468908014616e-05, + "loss": 0.043, + "step": 15250 + }, + { + "epoch": 0.43631165117941384, + "grad_norm": 0.4902660846710205, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0371, + "step": 15260 + }, + { + "epoch": 0.4365975696926376, + "grad_norm": 0.5742762088775635, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0369, + "step": 15270 + }, + { + "epoch": 0.4368834882058613, + "grad_norm": 0.42058590054512024, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0378, + "step": 15280 + }, + { + "epoch": 0.43716940671908505, + "grad_norm": 0.43729284405708313, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0352, + "step": 15290 + }, + { + "epoch": 0.4374553252323088, + "grad_norm": 0.4689466953277588, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0433, + "step": 15300 + }, + { + "epoch": 0.4377412437455325, + "grad_norm": 0.6272432208061218, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0548, + "step": 15310 + }, + { + "epoch": 0.43802716225875626, + "grad_norm": 1.1129611730575562, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0437, + "step": 15320 + }, + { + "epoch": 0.43831308077198, + "grad_norm": 0.9332655072212219, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0503, + "step": 15330 + }, + { + "epoch": 0.43859899928520374, + "grad_norm": 0.35150477290153503, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0351, + "step": 15340 + }, + { + "epoch": 0.4388849177984274, + "grad_norm": 0.3826565444469452, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0361, + "step": 15350 + }, + { + "epoch": 0.43917083631165116, + "grad_norm": 0.817319393157959, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0352, + "step": 15360 + }, + { + "epoch": 0.4394567548248749, + "grad_norm": 0.4379598796367645, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0469, + "step": 15370 + }, + { + "epoch": 0.43974267333809863, + "grad_norm": 0.6475314497947693, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0456, + "step": 15380 + }, + { + "epoch": 0.44002859185132237, + "grad_norm": 0.529088020324707, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0453, + "step": 15390 + }, + { + "epoch": 0.4403145103645461, + "grad_norm": 0.4915194809436798, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0369, + "step": 15400 + }, + { + "epoch": 0.44060042887776985, + "grad_norm": 0.4766380786895752, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0391, + "step": 15410 + }, + { + "epoch": 0.4408863473909936, + "grad_norm": 0.34667786955833435, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0327, + "step": 15420 + }, + { + "epoch": 0.4411722659042173, + "grad_norm": 0.504242479801178, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0413, + "step": 15430 + }, + { + "epoch": 0.441458184417441, + "grad_norm": 0.49786439538002014, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0361, + "step": 15440 + }, + { + "epoch": 0.44174410293066474, + "grad_norm": 0.4997329115867615, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0368, + "step": 15450 + }, + { + "epoch": 0.4420300214438885, + "grad_norm": 0.2992185056209564, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0359, + "step": 15460 + }, + { + "epoch": 0.4423159399571122, + "grad_norm": 0.6645393371582031, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0401, + "step": 15470 + }, + { + "epoch": 0.44260185847033595, + "grad_norm": 0.6327983140945435, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0386, + "step": 15480 + }, + { + "epoch": 0.4428877769835597, + "grad_norm": 0.45607903599739075, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0386, + "step": 15490 + }, + { + "epoch": 0.4431736954967834, + "grad_norm": 0.4401610493659973, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0417, + "step": 15500 + }, + { + "epoch": 0.44345961401000716, + "grad_norm": 0.5778466463088989, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0417, + "step": 15510 + }, + { + "epoch": 0.4437455325232309, + "grad_norm": 0.2164914309978485, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0355, + "step": 15520 + }, + { + "epoch": 0.4440314510364546, + "grad_norm": 0.3869318664073944, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0361, + "step": 15530 + }, + { + "epoch": 0.4443173695496783, + "grad_norm": 0.3843154311180115, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0459, + "step": 15540 + }, + { + "epoch": 0.44460328806290206, + "grad_norm": 0.8488825559616089, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0406, + "step": 15550 + }, + { + "epoch": 0.4448892065761258, + "grad_norm": 0.5055183172225952, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0359, + "step": 15560 + }, + { + "epoch": 0.44517512508934953, + "grad_norm": 0.40923011302948, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0435, + "step": 15570 + }, + { + "epoch": 0.44546104360257327, + "grad_norm": 0.48997730016708374, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0395, + "step": 15580 + }, + { + "epoch": 0.445746962115797, + "grad_norm": 0.5149131417274475, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.041, + "step": 15590 + }, + { + "epoch": 0.44603288062902074, + "grad_norm": 0.7277303338050842, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0452, + "step": 15600 + }, + { + "epoch": 0.4463187991422445, + "grad_norm": 0.48676377534866333, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0363, + "step": 15610 + }, + { + "epoch": 0.44660471765546816, + "grad_norm": 0.49031221866607666, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0356, + "step": 15620 + }, + { + "epoch": 0.4468906361686919, + "grad_norm": 0.38877514004707336, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.036, + "step": 15630 + }, + { + "epoch": 0.44717655468191564, + "grad_norm": 0.570068895816803, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0403, + "step": 15640 + }, + { + "epoch": 0.4474624731951394, + "grad_norm": 0.48499882221221924, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0395, + "step": 15650 + }, + { + "epoch": 0.4477483917083631, + "grad_norm": 0.7251732349395752, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0399, + "step": 15660 + }, + { + "epoch": 0.44803431022158685, + "grad_norm": 0.3927334249019623, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0359, + "step": 15670 + }, + { + "epoch": 0.4483202287348106, + "grad_norm": 0.5614549517631531, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.035, + "step": 15680 + }, + { + "epoch": 0.4486061472480343, + "grad_norm": 0.383831262588501, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0416, + "step": 15690 + }, + { + "epoch": 0.44889206576125806, + "grad_norm": 1.9365276098251343, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0498, + "step": 15700 + }, + { + "epoch": 0.44917798427448175, + "grad_norm": 0.6964924931526184, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.034, + "step": 15710 + }, + { + "epoch": 0.4494639027877055, + "grad_norm": 0.5148108601570129, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0401, + "step": 15720 + }, + { + "epoch": 0.4497498213009292, + "grad_norm": 0.4529317617416382, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0361, + "step": 15730 + }, + { + "epoch": 0.45003573981415296, + "grad_norm": 0.6648512482643127, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0365, + "step": 15740 + }, + { + "epoch": 0.4503216583273767, + "grad_norm": 0.8183113932609558, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0416, + "step": 15750 + }, + { + "epoch": 0.45060757684060043, + "grad_norm": 0.8802638649940491, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0406, + "step": 15760 + }, + { + "epoch": 0.45089349535382417, + "grad_norm": 0.6329004764556885, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0395, + "step": 15770 + }, + { + "epoch": 0.4511794138670479, + "grad_norm": 0.35283520817756653, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0364, + "step": 15780 + }, + { + "epoch": 0.45146533238027164, + "grad_norm": 0.5156061053276062, + "learning_rate": 1.071827766589186e-05, + "loss": 0.031, + "step": 15790 + }, + { + "epoch": 0.4517512508934953, + "grad_norm": 0.37875205278396606, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0375, + "step": 15800 + }, + { + "epoch": 0.45203716940671906, + "grad_norm": 0.5543273687362671, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0421, + "step": 15810 + }, + { + "epoch": 0.4523230879199428, + "grad_norm": 0.3808431923389435, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0323, + "step": 15820 + }, + { + "epoch": 0.45260900643316654, + "grad_norm": 0.8648643493652344, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0396, + "step": 15830 + }, + { + "epoch": 0.4528949249463903, + "grad_norm": 0.7893536686897278, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0417, + "step": 15840 + }, + { + "epoch": 0.453180843459614, + "grad_norm": 0.904137134552002, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0384, + "step": 15850 + }, + { + "epoch": 0.45346676197283775, + "grad_norm": 0.6095889806747437, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0457, + "step": 15860 + }, + { + "epoch": 0.4537526804860615, + "grad_norm": 0.5691415667533875, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0438, + "step": 15870 + }, + { + "epoch": 0.4540385989992852, + "grad_norm": 0.37868618965148926, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0414, + "step": 15880 + }, + { + "epoch": 0.4543245175125089, + "grad_norm": 0.7962950468063354, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0405, + "step": 15890 + }, + { + "epoch": 0.45461043602573264, + "grad_norm": 0.8862378597259521, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0475, + "step": 15900 + }, + { + "epoch": 0.4548963545389564, + "grad_norm": 0.8762509822845459, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0472, + "step": 15910 + }, + { + "epoch": 0.4551822730521801, + "grad_norm": 0.6006313562393188, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0417, + "step": 15920 + }, + { + "epoch": 0.45546819156540386, + "grad_norm": 0.3340131938457489, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0374, + "step": 15930 + }, + { + "epoch": 0.4557541100786276, + "grad_norm": 0.2639552056789398, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0387, + "step": 15940 + }, + { + "epoch": 0.45604002859185133, + "grad_norm": 0.42564907670021057, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0376, + "step": 15950 + }, + { + "epoch": 0.45632594710507507, + "grad_norm": 0.503834068775177, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0344, + "step": 15960 + }, + { + "epoch": 0.4566118656182988, + "grad_norm": 0.5962334871292114, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0379, + "step": 15970 + }, + { + "epoch": 0.4568977841315225, + "grad_norm": 0.3271556794643402, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0361, + "step": 15980 + }, + { + "epoch": 0.4571837026447462, + "grad_norm": 0.5501612424850464, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0356, + "step": 15990 + }, + { + "epoch": 0.45746962115796996, + "grad_norm": 1.0399914979934692, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.039, + "step": 16000 + }, + { + "epoch": 0.4577555396711937, + "grad_norm": 0.42251288890838623, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0413, + "step": 16010 + }, + { + "epoch": 0.45804145818441744, + "grad_norm": 0.5694882869720459, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0501, + "step": 16020 + }, + { + "epoch": 0.4583273766976412, + "grad_norm": 0.37367814779281616, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0388, + "step": 16030 + }, + { + "epoch": 0.4586132952108649, + "grad_norm": 0.7947224974632263, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0324, + "step": 16040 + }, + { + "epoch": 0.45889921372408865, + "grad_norm": 0.47871798276901245, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0345, + "step": 16050 + }, + { + "epoch": 0.4591851322373124, + "grad_norm": 1.4443609714508057, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0502, + "step": 16060 + }, + { + "epoch": 0.45947105075053607, + "grad_norm": 0.8326191902160645, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0325, + "step": 16070 + }, + { + "epoch": 0.4597569692637598, + "grad_norm": 0.2887400686740875, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.035, + "step": 16080 + }, + { + "epoch": 0.46004288777698354, + "grad_norm": 0.34353405237197876, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0324, + "step": 16090 + }, + { + "epoch": 0.4603288062902073, + "grad_norm": 0.7319850325584412, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0307, + "step": 16100 + }, + { + "epoch": 0.460614724803431, + "grad_norm": 0.6628556847572327, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0398, + "step": 16110 + }, + { + "epoch": 0.46090064331665476, + "grad_norm": 0.39974722266197205, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.038, + "step": 16120 + }, + { + "epoch": 0.4611865618298785, + "grad_norm": 0.7769339680671692, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0425, + "step": 16130 + }, + { + "epoch": 0.46147248034310223, + "grad_norm": 0.6823691129684448, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.039, + "step": 16140 + }, + { + "epoch": 0.46175839885632597, + "grad_norm": 0.6749460697174072, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0388, + "step": 16150 + }, + { + "epoch": 0.46204431736954965, + "grad_norm": 1.0745635032653809, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0406, + "step": 16160 + }, + { + "epoch": 0.4623302358827734, + "grad_norm": 0.8388734459877014, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0345, + "step": 16170 + }, + { + "epoch": 0.4626161543959971, + "grad_norm": 0.675828218460083, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0355, + "step": 16180 + }, + { + "epoch": 0.46290207290922086, + "grad_norm": 0.9872504472732544, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0374, + "step": 16190 + }, + { + "epoch": 0.4631879914224446, + "grad_norm": 0.4705125689506531, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0416, + "step": 16200 + }, + { + "epoch": 0.46347390993566834, + "grad_norm": 0.43577539920806885, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.041, + "step": 16210 + }, + { + "epoch": 0.4637598284488921, + "grad_norm": 0.6472166180610657, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0372, + "step": 16220 + }, + { + "epoch": 0.4640457469621158, + "grad_norm": 1.0108906030654907, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0464, + "step": 16230 + }, + { + "epoch": 0.46433166547533955, + "grad_norm": 0.6221884489059448, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0396, + "step": 16240 + }, + { + "epoch": 0.46461758398856323, + "grad_norm": 0.7375202178955078, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0365, + "step": 16250 + }, + { + "epoch": 0.46490350250178697, + "grad_norm": 0.5090222358703613, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0404, + "step": 16260 + }, + { + "epoch": 0.4651894210150107, + "grad_norm": 0.5641722679138184, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0424, + "step": 16270 + }, + { + "epoch": 0.46547533952823444, + "grad_norm": 0.3946240246295929, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0433, + "step": 16280 + }, + { + "epoch": 0.4657612580414582, + "grad_norm": 0.525059700012207, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0399, + "step": 16290 + }, + { + "epoch": 0.4660471765546819, + "grad_norm": 0.6106441617012024, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0417, + "step": 16300 + }, + { + "epoch": 0.46633309506790566, + "grad_norm": 0.7064299583435059, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0331, + "step": 16310 + }, + { + "epoch": 0.4666190135811294, + "grad_norm": 0.6251654624938965, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0377, + "step": 16320 + }, + { + "epoch": 0.46690493209435313, + "grad_norm": 0.6626482009887695, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0355, + "step": 16330 + }, + { + "epoch": 0.4671908506075768, + "grad_norm": 0.32827794551849365, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0438, + "step": 16340 + }, + { + "epoch": 0.46747676912080055, + "grad_norm": 1.147644281387329, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.041, + "step": 16350 + }, + { + "epoch": 0.4677626876340243, + "grad_norm": 0.5785626769065857, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0362, + "step": 16360 + }, + { + "epoch": 0.468048606147248, + "grad_norm": 0.7087936401367188, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0364, + "step": 16370 + }, + { + "epoch": 0.46833452466047176, + "grad_norm": 0.7729533314704895, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0357, + "step": 16380 + }, + { + "epoch": 0.4686204431736955, + "grad_norm": 0.9080077409744263, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0445, + "step": 16390 + }, + { + "epoch": 0.46890636168691924, + "grad_norm": 0.5273067355155945, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0395, + "step": 16400 + }, + { + "epoch": 0.469192280200143, + "grad_norm": 0.4801991581916809, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0469, + "step": 16410 + }, + { + "epoch": 0.4694781987133667, + "grad_norm": 0.38060688972473145, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0377, + "step": 16420 + }, + { + "epoch": 0.4697641172265904, + "grad_norm": 1.335648536682129, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0444, + "step": 16430 + }, + { + "epoch": 0.47005003573981413, + "grad_norm": 0.6224690079689026, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0365, + "step": 16440 + }, + { + "epoch": 0.47033595425303787, + "grad_norm": 0.39938899874687195, + "learning_rate": 1.007637577910799e-05, + "loss": 0.037, + "step": 16450 + }, + { + "epoch": 0.4706218727662616, + "grad_norm": 0.47899872064590454, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0371, + "step": 16460 + }, + { + "epoch": 0.47090779127948534, + "grad_norm": 0.8991144895553589, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0337, + "step": 16470 + }, + { + "epoch": 0.4711937097927091, + "grad_norm": 0.6228598356246948, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0388, + "step": 16480 + }, + { + "epoch": 0.4714796283059328, + "grad_norm": 0.41108259558677673, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0378, + "step": 16490 + }, + { + "epoch": 0.47176554681915656, + "grad_norm": 0.722955048084259, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0381, + "step": 16500 + }, + { + "epoch": 0.4720514653323803, + "grad_norm": 0.6090973019599915, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0348, + "step": 16510 + }, + { + "epoch": 0.472337383845604, + "grad_norm": 0.483549565076828, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0456, + "step": 16520 + }, + { + "epoch": 0.4726233023588277, + "grad_norm": 0.4134727418422699, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0444, + "step": 16530 + }, + { + "epoch": 0.47290922087205145, + "grad_norm": 0.4629753530025482, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0382, + "step": 16540 + }, + { + "epoch": 0.4731951393852752, + "grad_norm": 0.8709504008293152, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0384, + "step": 16550 + }, + { + "epoch": 0.4734810578984989, + "grad_norm": 0.683397114276886, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0398, + "step": 16560 + }, + { + "epoch": 0.47376697641172266, + "grad_norm": 0.5743465423583984, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0431, + "step": 16570 + }, + { + "epoch": 0.4740528949249464, + "grad_norm": 1.0080480575561523, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0378, + "step": 16580 + }, + { + "epoch": 0.47433881343817014, + "grad_norm": 0.4668700098991394, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0369, + "step": 16590 + }, + { + "epoch": 0.4746247319513939, + "grad_norm": 0.6005896925926208, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0508, + "step": 16600 + }, + { + "epoch": 0.47491065046461756, + "grad_norm": 0.5788530707359314, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0354, + "step": 16610 + }, + { + "epoch": 0.4751965689778413, + "grad_norm": 0.38784441351890564, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0357, + "step": 16620 + }, + { + "epoch": 0.47548248749106503, + "grad_norm": 0.4809567928314209, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0331, + "step": 16630 + }, + { + "epoch": 0.47576840600428877, + "grad_norm": 0.6647809147834778, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0473, + "step": 16640 + }, + { + "epoch": 0.4760543245175125, + "grad_norm": 0.3968522548675537, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0304, + "step": 16650 + }, + { + "epoch": 0.47634024303073624, + "grad_norm": 0.3258526027202606, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0387, + "step": 16660 + }, + { + "epoch": 0.47662616154396, + "grad_norm": 0.43442079424858093, + "learning_rate": 9.863295834019308e-06, + "loss": 0.04, + "step": 16670 + }, + { + "epoch": 0.4769120800571837, + "grad_norm": 0.36909565329551697, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0351, + "step": 16680 + }, + { + "epoch": 0.47719799857040746, + "grad_norm": 0.5566768050193787, + "learning_rate": 9.843955128197274e-06, + "loss": 0.031, + "step": 16690 + }, + { + "epoch": 0.47748391708363114, + "grad_norm": 0.5705142617225647, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0359, + "step": 16700 + }, + { + "epoch": 0.4777698355968549, + "grad_norm": 0.28931716084480286, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0407, + "step": 16710 + }, + { + "epoch": 0.4780557541100786, + "grad_norm": 0.5509498715400696, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0363, + "step": 16720 + }, + { + "epoch": 0.47834167262330235, + "grad_norm": 0.3564346432685852, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0364, + "step": 16730 + }, + { + "epoch": 0.4786275911365261, + "grad_norm": 0.32734423875808716, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0369, + "step": 16740 + }, + { + "epoch": 0.4789135096497498, + "grad_norm": 0.3048594892024994, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0367, + "step": 16750 + }, + { + "epoch": 0.47919942816297356, + "grad_norm": 0.9007049798965454, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0377, + "step": 16760 + }, + { + "epoch": 0.4794853466761973, + "grad_norm": 0.7010983824729919, + "learning_rate": 9.76664747972605e-06, + "loss": 0.039, + "step": 16770 + }, + { + "epoch": 0.47977126518942104, + "grad_norm": 0.644473135471344, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0466, + "step": 16780 + }, + { + "epoch": 0.4800571837026447, + "grad_norm": 0.6333492398262024, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0373, + "step": 16790 + }, + { + "epoch": 0.48034310221586846, + "grad_norm": 0.5148355960845947, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0392, + "step": 16800 + }, + { + "epoch": 0.4806290207290922, + "grad_norm": 0.7288355231285095, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0381, + "step": 16810 + }, + { + "epoch": 0.48091493924231593, + "grad_norm": 0.3674873113632202, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0418, + "step": 16820 + }, + { + "epoch": 0.48120085775553967, + "grad_norm": 0.5055420398712158, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0336, + "step": 16830 + }, + { + "epoch": 0.4814867762687634, + "grad_norm": 0.641754686832428, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0342, + "step": 16840 + }, + { + "epoch": 0.48177269478198714, + "grad_norm": 0.308200478553772, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0364, + "step": 16850 + }, + { + "epoch": 0.4820586132952109, + "grad_norm": 0.41361021995544434, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0342, + "step": 16860 + }, + { + "epoch": 0.4823445318084346, + "grad_norm": 0.45777833461761475, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0353, + "step": 16870 + }, + { + "epoch": 0.4826304503216583, + "grad_norm": 0.7587664723396301, + "learning_rate": 9.660501900166734e-06, + "loss": 0.043, + "step": 16880 + }, + { + "epoch": 0.48291636883488204, + "grad_norm": 0.8740283250808716, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0372, + "step": 16890 + }, + { + "epoch": 0.4832022873481058, + "grad_norm": 0.3009270429611206, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0373, + "step": 16900 + }, + { + "epoch": 0.4834882058613295, + "grad_norm": 0.4439285695552826, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0349, + "step": 16910 + }, + { + "epoch": 0.48377412437455325, + "grad_norm": 0.39849671721458435, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0394, + "step": 16920 + }, + { + "epoch": 0.484060042887777, + "grad_norm": 0.6423043608665466, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0413, + "step": 16930 + }, + { + "epoch": 0.4843459614010007, + "grad_norm": 0.3683928847312927, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0387, + "step": 16940 + }, + { + "epoch": 0.48463187991422446, + "grad_norm": 0.7087769508361816, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0397, + "step": 16950 + }, + { + "epoch": 0.4849177984274482, + "grad_norm": 0.5348120927810669, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0405, + "step": 16960 + }, + { + "epoch": 0.4852037169406719, + "grad_norm": 0.549891471862793, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0363, + "step": 16970 + }, + { + "epoch": 0.4854896354538956, + "grad_norm": 0.7177272439002991, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0343, + "step": 16980 + }, + { + "epoch": 0.48577555396711936, + "grad_norm": 0.595417320728302, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0439, + "step": 16990 + }, + { + "epoch": 0.4860614724803431, + "grad_norm": 0.4838889241218567, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0387, + "step": 17000 + }, + { + "epoch": 0.48634739099356683, + "grad_norm": 0.6186223030090332, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0362, + "step": 17010 + }, + { + "epoch": 0.48663330950679057, + "grad_norm": 0.43383121490478516, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0381, + "step": 17020 + }, + { + "epoch": 0.4869192280200143, + "grad_norm": 0.6735527515411377, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0388, + "step": 17030 + }, + { + "epoch": 0.48720514653323804, + "grad_norm": 0.3746320605278015, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0491, + "step": 17040 + }, + { + "epoch": 0.4874910650464618, + "grad_norm": 0.29500988125801086, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0395, + "step": 17050 + }, + { + "epoch": 0.48777698355968546, + "grad_norm": 0.8518465757369995, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0435, + "step": 17060 + }, + { + "epoch": 0.4880629020729092, + "grad_norm": 0.9653190970420837, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0393, + "step": 17070 + }, + { + "epoch": 0.48834882058613294, + "grad_norm": 0.785724937915802, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0372, + "step": 17080 + }, + { + "epoch": 0.4886347390993567, + "grad_norm": 0.9450638890266418, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0406, + "step": 17090 + }, + { + "epoch": 0.4889206576125804, + "grad_norm": 0.645124077796936, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0361, + "step": 17100 + }, + { + "epoch": 0.48920657612580415, + "grad_norm": 0.3352372944355011, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0417, + "step": 17110 + }, + { + "epoch": 0.4894924946390279, + "grad_norm": 0.3858814835548401, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0345, + "step": 17120 + }, + { + "epoch": 0.4897784131522516, + "grad_norm": 0.5403604507446289, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0326, + "step": 17130 + }, + { + "epoch": 0.49006433166547536, + "grad_norm": 0.6986777782440186, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0417, + "step": 17140 + }, + { + "epoch": 0.49035025017869904, + "grad_norm": 0.5456675887107849, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0473, + "step": 17150 + }, + { + "epoch": 0.4906361686919228, + "grad_norm": 0.3961554765701294, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0341, + "step": 17160 + }, + { + "epoch": 0.4909220872051465, + "grad_norm": 0.5188277363777161, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0369, + "step": 17170 + }, + { + "epoch": 0.49120800571837026, + "grad_norm": 0.6042230725288391, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0352, + "step": 17180 + }, + { + "epoch": 0.491493924231594, + "grad_norm": 0.5485941171646118, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0405, + "step": 17190 + }, + { + "epoch": 0.49177984274481773, + "grad_norm": 0.5856509804725647, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0402, + "step": 17200 + }, + { + "epoch": 0.49206576125804147, + "grad_norm": 0.8656556010246277, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0349, + "step": 17210 + }, + { + "epoch": 0.4923516797712652, + "grad_norm": 0.4041757583618164, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0364, + "step": 17220 + }, + { + "epoch": 0.49263759828448894, + "grad_norm": 0.6135975122451782, + "learning_rate": 9.324104146177972e-06, + "loss": 0.036, + "step": 17230 + }, + { + "epoch": 0.4929235167977126, + "grad_norm": 0.5101860165596008, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0359, + "step": 17240 + }, + { + "epoch": 0.49320943531093636, + "grad_norm": 0.9913426041603088, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0552, + "step": 17250 + }, + { + "epoch": 0.4934953538241601, + "grad_norm": 0.6148158311843872, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0388, + "step": 17260 + }, + { + "epoch": 0.49378127233738384, + "grad_norm": 0.6651721596717834, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0374, + "step": 17270 + }, + { + "epoch": 0.4940671908506076, + "grad_norm": 0.9545061588287354, + "learning_rate": 9.276232738281744e-06, + "loss": 0.035, + "step": 17280 + }, + { + "epoch": 0.4943531093638313, + "grad_norm": 0.8923225402832031, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0366, + "step": 17290 + }, + { + "epoch": 0.49463902787705505, + "grad_norm": 0.5337848663330078, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0354, + "step": 17300 + }, + { + "epoch": 0.4949249463902788, + "grad_norm": 0.35039281845092773, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0341, + "step": 17310 + }, + { + "epoch": 0.4952108649035025, + "grad_norm": 0.47406911849975586, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0393, + "step": 17320 + }, + { + "epoch": 0.4954967834167262, + "grad_norm": 0.6226631999015808, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0375, + "step": 17330 + }, + { + "epoch": 0.49578270192994994, + "grad_norm": 0.6652712821960449, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0363, + "step": 17340 + }, + { + "epoch": 0.4960686204431737, + "grad_norm": 1.0042835474014282, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0368, + "step": 17350 + }, + { + "epoch": 0.4963545389563974, + "grad_norm": 0.4334045648574829, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0375, + "step": 17360 + }, + { + "epoch": 0.49664045746962115, + "grad_norm": 0.3561633229255676, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0347, + "step": 17370 + }, + { + "epoch": 0.4969263759828449, + "grad_norm": 0.5763550996780396, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0344, + "step": 17380 + }, + { + "epoch": 0.49721229449606863, + "grad_norm": 0.6306643486022949, + "learning_rate": 9.171095634265995e-06, + "loss": 0.037, + "step": 17390 + }, + { + "epoch": 0.49749821300929237, + "grad_norm": 0.4286569058895111, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0308, + "step": 17400 + }, + { + "epoch": 0.4977841315225161, + "grad_norm": 0.577983558177948, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0393, + "step": 17410 + }, + { + "epoch": 0.4980700500357398, + "grad_norm": 0.5714932084083557, + "learning_rate": 9.142466323573853e-06, + "loss": 0.038, + "step": 17420 + }, + { + "epoch": 0.4983559685489635, + "grad_norm": 0.7529498338699341, + "learning_rate": 9.132927564918328e-06, + "loss": 0.033, + "step": 17430 + }, + { + "epoch": 0.49864188706218726, + "grad_norm": 0.5179672241210938, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0367, + "step": 17440 + }, + { + "epoch": 0.498927805575411, + "grad_norm": 0.38424569368362427, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0401, + "step": 17450 + }, + { + "epoch": 0.49921372408863474, + "grad_norm": 0.469460129737854, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0379, + "step": 17460 + }, + { + "epoch": 0.4994996426018585, + "grad_norm": 0.3285387456417084, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0399, + "step": 17470 + }, + { + "epoch": 0.4997855611150822, + "grad_norm": 0.49863550066947937, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0313, + "step": 17480 + }, + { + "epoch": 0.5000714796283059, + "grad_norm": 0.3926186263561249, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0454, + "step": 17490 + }, + { + "epoch": 0.5003573981415297, + "grad_norm": 0.4476146399974823, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0472, + "step": 17500 + }, + { + "epoch": 0.5006433166547534, + "grad_norm": 0.5645599961280823, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0358, + "step": 17510 + }, + { + "epoch": 0.5009292351679772, + "grad_norm": 0.4813307225704193, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0364, + "step": 17520 + }, + { + "epoch": 0.5012151536812008, + "grad_norm": 0.49410971999168396, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0303, + "step": 17530 + }, + { + "epoch": 0.5015010721944246, + "grad_norm": 0.7172105312347412, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0404, + "step": 17540 + }, + { + "epoch": 0.5017869907076483, + "grad_norm": 0.43401873111724854, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0402, + "step": 17550 + }, + { + "epoch": 0.502072909220872, + "grad_norm": 0.6497406363487244, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0364, + "step": 17560 + }, + { + "epoch": 0.5023588277340958, + "grad_norm": 0.44618356227874756, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0337, + "step": 17570 + }, + { + "epoch": 0.5026447462473195, + "grad_norm": 0.4186992049217224, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0381, + "step": 17580 + }, + { + "epoch": 0.5029306647605433, + "grad_norm": 0.7387974858283997, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0319, + "step": 17590 + }, + { + "epoch": 0.503216583273767, + "grad_norm": 0.8068642020225525, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0373, + "step": 17600 + }, + { + "epoch": 0.5035025017869907, + "grad_norm": 0.5773473978042603, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0372, + "step": 17610 + }, + { + "epoch": 0.5037884203002144, + "grad_norm": 0.32488778233528137, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0334, + "step": 17620 + }, + { + "epoch": 0.5040743388134382, + "grad_norm": 0.33978500962257385, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0493, + "step": 17630 + }, + { + "epoch": 0.5043602573266619, + "grad_norm": 0.5897071361541748, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0335, + "step": 17640 + }, + { + "epoch": 0.5046461758398856, + "grad_norm": 0.6275895833969116, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0395, + "step": 17650 + }, + { + "epoch": 0.5049320943531094, + "grad_norm": 0.7995536923408508, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0422, + "step": 17660 + }, + { + "epoch": 0.505218012866333, + "grad_norm": 0.8734716773033142, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0414, + "step": 17670 + }, + { + "epoch": 0.5055039313795568, + "grad_norm": 0.6239343881607056, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0333, + "step": 17680 + }, + { + "epoch": 0.5057898498927805, + "grad_norm": 0.42508623003959656, + "learning_rate": 8.885721609997551e-06, + "loss": 0.045, + "step": 17690 + }, + { + "epoch": 0.5060757684060043, + "grad_norm": 0.4272485673427582, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0506, + "step": 17700 + }, + { + "epoch": 0.506361686919228, + "grad_norm": 0.8006368279457092, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0431, + "step": 17710 + }, + { + "epoch": 0.5066476054324518, + "grad_norm": 0.5896835327148438, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0322, + "step": 17720 + }, + { + "epoch": 0.5069335239456755, + "grad_norm": 0.6880389451980591, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0322, + "step": 17730 + }, + { + "epoch": 0.5072194424588992, + "grad_norm": 1.4850202798843384, + "learning_rate": 8.83836825410936e-06, + "loss": 0.052, + "step": 17740 + }, + { + "epoch": 0.507505360972123, + "grad_norm": 0.7684240937232971, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0353, + "step": 17750 + }, + { + "epoch": 0.5077912794853466, + "grad_norm": 0.5456307530403137, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0419, + "step": 17760 + }, + { + "epoch": 0.5080771979985704, + "grad_norm": 0.5775120258331299, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0366, + "step": 17770 + }, + { + "epoch": 0.5083631165117941, + "grad_norm": 0.6453070044517517, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0341, + "step": 17780 + }, + { + "epoch": 0.5086490350250179, + "grad_norm": 0.7906973361968994, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0405, + "step": 17790 + }, + { + "epoch": 0.5089349535382416, + "grad_norm": 1.0740606784820557, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0344, + "step": 17800 + }, + { + "epoch": 0.5092208720514654, + "grad_norm": 0.41854357719421387, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0334, + "step": 17810 + }, + { + "epoch": 0.5095067905646891, + "grad_norm": 0.6328964233398438, + "learning_rate": 8.762735374981932e-06, + "loss": 0.036, + "step": 17820 + }, + { + "epoch": 0.5097927090779127, + "grad_norm": 0.40875789523124695, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0338, + "step": 17830 + }, + { + "epoch": 0.5100786275911365, + "grad_norm": 0.5056312084197998, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0332, + "step": 17840 + }, + { + "epoch": 0.5103645461043602, + "grad_norm": 0.5005037784576416, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0416, + "step": 17850 + }, + { + "epoch": 0.510650464617584, + "grad_norm": 0.5689167380332947, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0329, + "step": 17860 + }, + { + "epoch": 0.5109363831308077, + "grad_norm": 0.5222717523574829, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0336, + "step": 17870 + }, + { + "epoch": 0.5112223016440315, + "grad_norm": 0.5998329520225525, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0354, + "step": 17880 + }, + { + "epoch": 0.5115082201572552, + "grad_norm": 0.4684480130672455, + "learning_rate": 8.69669425266315e-06, + "loss": 0.05, + "step": 17890 + }, + { + "epoch": 0.511794138670479, + "grad_norm": 0.4061124622821808, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0384, + "step": 17900 + }, + { + "epoch": 0.5120800571837026, + "grad_norm": 0.5025928020477295, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0386, + "step": 17910 + }, + { + "epoch": 0.5123659756969263, + "grad_norm": 0.3731222152709961, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0378, + "step": 17920 + }, + { + "epoch": 0.5126518942101501, + "grad_norm": 0.7784973978996277, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0419, + "step": 17930 + }, + { + "epoch": 0.5129378127233738, + "grad_norm": 0.7074074745178223, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0386, + "step": 17940 + }, + { + "epoch": 0.5132237312365976, + "grad_norm": 0.49802306294441223, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0418, + "step": 17950 + }, + { + "epoch": 0.5135096497498213, + "grad_norm": 0.4355427920818329, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0431, + "step": 17960 + }, + { + "epoch": 0.5137955682630451, + "grad_norm": 0.672635555267334, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0403, + "step": 17970 + }, + { + "epoch": 0.5140814867762687, + "grad_norm": 0.6733908653259277, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0487, + "step": 17980 + }, + { + "epoch": 0.5143674052894925, + "grad_norm": 0.43711504340171814, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0378, + "step": 17990 + }, + { + "epoch": 0.5146533238027162, + "grad_norm": 0.6371222138404846, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0336, + "step": 18000 + }, + { + "epoch": 0.5149392423159399, + "grad_norm": 0.8007041811943054, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0371, + "step": 18010 + }, + { + "epoch": 0.5152251608291637, + "grad_norm": 0.4725078344345093, + "learning_rate": 8.574400723012433e-06, + "loss": 0.037, + "step": 18020 + }, + { + "epoch": 0.5155110793423874, + "grad_norm": 0.34229791164398193, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0353, + "step": 18030 + }, + { + "epoch": 0.5157969978556112, + "grad_norm": 0.27863454818725586, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0371, + "step": 18040 + }, + { + "epoch": 0.5160829163688349, + "grad_norm": 0.43021920323371887, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0419, + "step": 18050 + }, + { + "epoch": 0.5163688348820586, + "grad_norm": 0.4683758318424225, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0307, + "step": 18060 + }, + { + "epoch": 0.5166547533952823, + "grad_norm": 0.29085367918014526, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0372, + "step": 18070 + }, + { + "epoch": 0.5169406719085061, + "grad_norm": 0.4396727681159973, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0328, + "step": 18080 + }, + { + "epoch": 0.5172265904217298, + "grad_norm": 0.539021372795105, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0317, + "step": 18090 + }, + { + "epoch": 0.5175125089349535, + "grad_norm": 0.556974470615387, + "learning_rate": 8.499380733111628e-06, + "loss": 0.037, + "step": 18100 + }, + { + "epoch": 0.5177984274481773, + "grad_norm": 0.4445747137069702, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0344, + "step": 18110 + }, + { + "epoch": 0.518084345961401, + "grad_norm": 0.3742713928222656, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0339, + "step": 18120 + }, + { + "epoch": 0.5183702644746248, + "grad_norm": 0.8467416167259216, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0409, + "step": 18130 + }, + { + "epoch": 0.5186561829878484, + "grad_norm": 0.7731484770774841, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0379, + "step": 18140 + }, + { + "epoch": 0.5189421015010722, + "grad_norm": 0.5664084553718567, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0353, + "step": 18150 + }, + { + "epoch": 0.5192280200142959, + "grad_norm": 0.5623966455459595, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0412, + "step": 18160 + }, + { + "epoch": 0.5195139385275197, + "grad_norm": 0.5074556469917297, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0402, + "step": 18170 + }, + { + "epoch": 0.5197998570407434, + "grad_norm": 0.49439728260040283, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0341, + "step": 18180 + }, + { + "epoch": 0.5200857755539671, + "grad_norm": 0.5982527136802673, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0308, + "step": 18190 + }, + { + "epoch": 0.5203716940671909, + "grad_norm": 0.7891598343849182, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0437, + "step": 18200 + }, + { + "epoch": 0.5206576125804145, + "grad_norm": 0.7565666437149048, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0381, + "step": 18210 + }, + { + "epoch": 0.5209435310936383, + "grad_norm": 0.33346351981163025, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0454, + "step": 18220 + }, + { + "epoch": 0.521229449606862, + "grad_norm": 0.5885659456253052, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0413, + "step": 18230 + }, + { + "epoch": 0.5215153681200858, + "grad_norm": 0.6487091183662415, + "learning_rate": 8.368551060444755e-06, + "loss": 0.035, + "step": 18240 + }, + { + "epoch": 0.5218012866333095, + "grad_norm": 0.9817430377006531, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0394, + "step": 18250 + }, + { + "epoch": 0.5220872051465333, + "grad_norm": 0.5691193342208862, + "learning_rate": 8.349909816537207e-06, + "loss": 0.041, + "step": 18260 + }, + { + "epoch": 0.522373123659757, + "grad_norm": 0.5326661467552185, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0361, + "step": 18270 + }, + { + "epoch": 0.5226590421729806, + "grad_norm": 0.5536142587661743, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0406, + "step": 18280 + }, + { + "epoch": 0.5229449606862044, + "grad_norm": 0.3482394218444824, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0423, + "step": 18290 + }, + { + "epoch": 0.5232308791994281, + "grad_norm": 0.514914333820343, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0352, + "step": 18300 + }, + { + "epoch": 0.5235167977126519, + "grad_norm": 0.7681404948234558, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0386, + "step": 18310 + }, + { + "epoch": 0.5238027162258756, + "grad_norm": 0.400426983833313, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0333, + "step": 18320 + }, + { + "epoch": 0.5240886347390994, + "grad_norm": 0.4996081590652466, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0381, + "step": 18330 + }, + { + "epoch": 0.5243745532523231, + "grad_norm": 0.5379085540771484, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0348, + "step": 18340 + }, + { + "epoch": 0.5246604717655469, + "grad_norm": 0.4462053179740906, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0307, + "step": 18350 + }, + { + "epoch": 0.5249463902787705, + "grad_norm": 0.7336096167564392, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0345, + "step": 18360 + }, + { + "epoch": 0.5252323087919942, + "grad_norm": 0.6676360368728638, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0346, + "step": 18370 + }, + { + "epoch": 0.525518227305218, + "grad_norm": 0.46608656644821167, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0334, + "step": 18380 + }, + { + "epoch": 0.5258041458184417, + "grad_norm": 0.4906940460205078, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0331, + "step": 18390 + }, + { + "epoch": 0.5260900643316655, + "grad_norm": 0.4200032353401184, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0394, + "step": 18400 + }, + { + "epoch": 0.5263759828448892, + "grad_norm": 0.5663877725601196, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0349, + "step": 18410 + }, + { + "epoch": 0.526661901358113, + "grad_norm": 0.36824384331703186, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0303, + "step": 18420 + }, + { + "epoch": 0.5269478198713367, + "grad_norm": 0.8120076060295105, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0443, + "step": 18430 + }, + { + "epoch": 0.5272337383845604, + "grad_norm": 0.4102472960948944, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0369, + "step": 18440 + }, + { + "epoch": 0.5275196568977841, + "grad_norm": 0.5186526775360107, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0338, + "step": 18450 + }, + { + "epoch": 0.5278055754110078, + "grad_norm": 0.9650108218193054, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0343, + "step": 18460 + }, + { + "epoch": 0.5280914939242316, + "grad_norm": 0.5894375443458557, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0416, + "step": 18470 + }, + { + "epoch": 0.5283774124374553, + "grad_norm": 0.6188816428184509, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0402, + "step": 18480 + }, + { + "epoch": 0.5286633309506791, + "grad_norm": 0.35280847549438477, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0363, + "step": 18490 + }, + { + "epoch": 0.5289492494639028, + "grad_norm": 0.7289313673973083, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0392, + "step": 18500 + }, + { + "epoch": 0.5292351679771266, + "grad_norm": 0.505050778388977, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0329, + "step": 18510 + }, + { + "epoch": 0.5295210864903502, + "grad_norm": 0.7029705047607422, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0344, + "step": 18520 + }, + { + "epoch": 0.529807005003574, + "grad_norm": 0.2958471477031708, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0431, + "step": 18530 + }, + { + "epoch": 0.5300929235167977, + "grad_norm": 0.9649683237075806, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0329, + "step": 18540 + }, + { + "epoch": 0.5303788420300214, + "grad_norm": 0.24733735620975494, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0354, + "step": 18550 + }, + { + "epoch": 0.5306647605432452, + "grad_norm": 0.44838136434555054, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0334, + "step": 18560 + }, + { + "epoch": 0.5309506790564689, + "grad_norm": 0.4505597949028015, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0338, + "step": 18570 + }, + { + "epoch": 0.5312365975696927, + "grad_norm": 0.44188442826271057, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0326, + "step": 18580 + }, + { + "epoch": 0.5315225160829163, + "grad_norm": 0.4539152979850769, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0369, + "step": 18590 + }, + { + "epoch": 0.5318084345961401, + "grad_norm": 0.8311023712158203, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0441, + "step": 18600 + }, + { + "epoch": 0.5320943531093638, + "grad_norm": 0.53764808177948, + "learning_rate": 8.025779439806006e-06, + "loss": 0.037, + "step": 18610 + }, + { + "epoch": 0.5323802716225876, + "grad_norm": 1.2192102670669556, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0369, + "step": 18620 + }, + { + "epoch": 0.5326661901358113, + "grad_norm": 0.5254611968994141, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0371, + "step": 18630 + }, + { + "epoch": 0.532952108649035, + "grad_norm": 0.585709810256958, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0337, + "step": 18640 + }, + { + "epoch": 0.5332380271622588, + "grad_norm": 0.45416259765625, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0425, + "step": 18650 + }, + { + "epoch": 0.5335239456754824, + "grad_norm": 0.3957739472389221, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0354, + "step": 18660 + }, + { + "epoch": 0.5338098641887062, + "grad_norm": 0.6211117506027222, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0347, + "step": 18670 + }, + { + "epoch": 0.5340957827019299, + "grad_norm": 0.49023327231407166, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0308, + "step": 18680 + }, + { + "epoch": 0.5343817012151537, + "grad_norm": 0.5823351144790649, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0351, + "step": 18690 + }, + { + "epoch": 0.5346676197283774, + "grad_norm": 0.6048677563667297, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0382, + "step": 18700 + }, + { + "epoch": 0.5349535382416012, + "grad_norm": 0.5293828845024109, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0329, + "step": 18710 + }, + { + "epoch": 0.5352394567548249, + "grad_norm": 0.5935509204864502, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0388, + "step": 18720 + }, + { + "epoch": 0.5355253752680486, + "grad_norm": 0.8369598388671875, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0421, + "step": 18730 + }, + { + "epoch": 0.5358112937812723, + "grad_norm": 0.6874870657920837, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0385, + "step": 18740 + }, + { + "epoch": 0.536097212294496, + "grad_norm": 0.43511492013931274, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0405, + "step": 18750 + }, + { + "epoch": 0.5363831308077198, + "grad_norm": 0.662755012512207, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0375, + "step": 18760 + }, + { + "epoch": 0.5366690493209435, + "grad_norm": 0.5519852638244629, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0351, + "step": 18770 + }, + { + "epoch": 0.5369549678341673, + "grad_norm": 0.9711637496948242, + "learning_rate": 7.869858673101027e-06, + "loss": 0.038, + "step": 18780 + }, + { + "epoch": 0.537240886347391, + "grad_norm": 0.4944411516189575, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0416, + "step": 18790 + }, + { + "epoch": 0.5375268048606148, + "grad_norm": 0.5257377624511719, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0349, + "step": 18800 + }, + { + "epoch": 0.5378127233738385, + "grad_norm": 0.4833063781261444, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0414, + "step": 18810 + }, + { + "epoch": 0.5380986418870621, + "grad_norm": 0.4496164917945862, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0369, + "step": 18820 + }, + { + "epoch": 0.5383845604002859, + "grad_norm": 0.6939138174057007, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0337, + "step": 18830 + }, + { + "epoch": 0.5386704789135096, + "grad_norm": 0.32579538226127625, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0371, + "step": 18840 + }, + { + "epoch": 0.5389563974267334, + "grad_norm": 0.35594654083251953, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0366, + "step": 18850 + }, + { + "epoch": 0.5392423159399571, + "grad_norm": 0.6114012002944946, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0372, + "step": 18860 + }, + { + "epoch": 0.5395282344531809, + "grad_norm": 0.8492457270622253, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0346, + "step": 18870 + }, + { + "epoch": 0.5398141529664046, + "grad_norm": 0.5214036703109741, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0388, + "step": 18880 + }, + { + "epoch": 0.5401000714796284, + "grad_norm": 0.428671658039093, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0394, + "step": 18890 + }, + { + "epoch": 0.540385989992852, + "grad_norm": 0.6071562767028809, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0371, + "step": 18900 + }, + { + "epoch": 0.5406719085060757, + "grad_norm": 0.41996505856513977, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0334, + "step": 18910 + }, + { + "epoch": 0.5409578270192995, + "grad_norm": 0.5260844826698303, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0362, + "step": 18920 + }, + { + "epoch": 0.5412437455325232, + "grad_norm": 0.43362122774124146, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0325, + "step": 18930 + }, + { + "epoch": 0.541529664045747, + "grad_norm": 0.4597149193286896, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0345, + "step": 18940 + }, + { + "epoch": 0.5418155825589707, + "grad_norm": 0.6667322516441345, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0421, + "step": 18950 + }, + { + "epoch": 0.5421015010721945, + "grad_norm": 0.8998900651931763, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0368, + "step": 18960 + }, + { + "epoch": 0.5423874195854181, + "grad_norm": 0.5075538158416748, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0327, + "step": 18970 + }, + { + "epoch": 0.5426733380986419, + "grad_norm": 0.38445526361465454, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0446, + "step": 18980 + }, + { + "epoch": 0.5429592566118656, + "grad_norm": 0.696186363697052, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0364, + "step": 18990 + }, + { + "epoch": 0.5432451751250893, + "grad_norm": 0.6371187567710876, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0401, + "step": 19000 + }, + { + "epoch": 0.5435310936383131, + "grad_norm": 0.6122881174087524, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0345, + "step": 19010 + }, + { + "epoch": 0.5438170121515368, + "grad_norm": 0.4222267270088196, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0456, + "step": 19020 + }, + { + "epoch": 0.5441029306647606, + "grad_norm": 0.6122517585754395, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0434, + "step": 19030 + }, + { + "epoch": 0.5443888491779842, + "grad_norm": 0.2783992886543274, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0354, + "step": 19040 + }, + { + "epoch": 0.544674767691208, + "grad_norm": 0.6433000564575195, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0321, + "step": 19050 + }, + { + "epoch": 0.5449606862044317, + "grad_norm": 0.6967030167579651, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0394, + "step": 19060 + }, + { + "epoch": 0.5452466047176555, + "grad_norm": 0.4799044132232666, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0329, + "step": 19070 + }, + { + "epoch": 0.5455325232308792, + "grad_norm": 0.633895993232727, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0316, + "step": 19080 + }, + { + "epoch": 0.5458184417441029, + "grad_norm": 0.5601945519447327, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0449, + "step": 19090 + }, + { + "epoch": 0.5461043602573267, + "grad_norm": 0.4917007088661194, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0351, + "step": 19100 + }, + { + "epoch": 0.5463902787705504, + "grad_norm": 0.4813363254070282, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.029, + "step": 19110 + }, + { + "epoch": 0.5466761972837741, + "grad_norm": 0.5359676480293274, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0346, + "step": 19120 + }, + { + "epoch": 0.5469621157969978, + "grad_norm": 0.6500958204269409, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0374, + "step": 19130 + }, + { + "epoch": 0.5472480343102216, + "grad_norm": 0.7708510756492615, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0332, + "step": 19140 + }, + { + "epoch": 0.5475339528234453, + "grad_norm": 0.45693230628967285, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0344, + "step": 19150 + }, + { + "epoch": 0.5478198713366691, + "grad_norm": 0.6046226620674133, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0342, + "step": 19160 + }, + { + "epoch": 0.5481057898498928, + "grad_norm": 0.5253175497055054, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0449, + "step": 19170 + }, + { + "epoch": 0.5483917083631165, + "grad_norm": 0.3790060877799988, + "learning_rate": 7.507267205473318e-06, + "loss": 0.037, + "step": 19180 + }, + { + "epoch": 0.5486776268763403, + "grad_norm": 0.37709203362464905, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0346, + "step": 19190 + }, + { + "epoch": 0.5489635453895639, + "grad_norm": 0.3940931558609009, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0427, + "step": 19200 + }, + { + "epoch": 0.5492494639027877, + "grad_norm": 0.761299192905426, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0353, + "step": 19210 + }, + { + "epoch": 0.5495353824160114, + "grad_norm": 0.5268495082855225, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0328, + "step": 19220 + }, + { + "epoch": 0.5498213009292352, + "grad_norm": 0.45624151825904846, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0353, + "step": 19230 + }, + { + "epoch": 0.5501072194424589, + "grad_norm": 0.5374972224235535, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0345, + "step": 19240 + }, + { + "epoch": 0.5503931379556827, + "grad_norm": 0.49830907583236694, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0328, + "step": 19250 + }, + { + "epoch": 0.5506790564689064, + "grad_norm": 0.6223296523094177, + "learning_rate": 7.435514206212475e-06, + "loss": 0.037, + "step": 19260 + }, + { + "epoch": 0.55096497498213, + "grad_norm": 0.42801398038864136, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0371, + "step": 19270 + }, + { + "epoch": 0.5512508934953538, + "grad_norm": 0.3872825801372528, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0314, + "step": 19280 + }, + { + "epoch": 0.5515368120085775, + "grad_norm": 0.3967494070529938, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0349, + "step": 19290 + }, + { + "epoch": 0.5518227305218013, + "grad_norm": 0.42383769154548645, + "learning_rate": 7.399737764864619e-06, + "loss": 0.045, + "step": 19300 + }, + { + "epoch": 0.552108649035025, + "grad_norm": 0.48501884937286377, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0373, + "step": 19310 + }, + { + "epoch": 0.5523945675482488, + "grad_norm": 0.3783693015575409, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0334, + "step": 19320 + }, + { + "epoch": 0.5526804860614725, + "grad_norm": 0.5733019709587097, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0369, + "step": 19330 + }, + { + "epoch": 0.5529664045746963, + "grad_norm": 0.5022825002670288, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0375, + "step": 19340 + }, + { + "epoch": 0.5532523230879199, + "grad_norm": 0.5508015155792236, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0415, + "step": 19350 + }, + { + "epoch": 0.5535382416011436, + "grad_norm": 0.5692425966262817, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0401, + "step": 19360 + }, + { + "epoch": 0.5538241601143674, + "grad_norm": 0.7247840762138367, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0373, + "step": 19370 + }, + { + "epoch": 0.5541100786275911, + "grad_norm": 0.633986234664917, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0435, + "step": 19380 + }, + { + "epoch": 0.5543959971408149, + "grad_norm": 0.8598711490631104, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0424, + "step": 19390 + }, + { + "epoch": 0.5546819156540386, + "grad_norm": 0.782328188419342, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0424, + "step": 19400 + }, + { + "epoch": 0.5549678341672624, + "grad_norm": 0.48890456557273865, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0351, + "step": 19410 + }, + { + "epoch": 0.555253752680486, + "grad_norm": 0.4759981036186218, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0395, + "step": 19420 + }, + { + "epoch": 0.5555396711937098, + "grad_norm": 0.6431323885917664, + "learning_rate": 7.283934675167239e-06, + "loss": 0.036, + "step": 19430 + }, + { + "epoch": 0.5558255897069335, + "grad_norm": 0.6633809208869934, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0333, + "step": 19440 + }, + { + "epoch": 0.5561115082201572, + "grad_norm": 0.3405994772911072, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0375, + "step": 19450 + }, + { + "epoch": 0.556397426733381, + "grad_norm": 0.3443987965583801, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0329, + "step": 19460 + }, + { + "epoch": 0.5566833452466047, + "grad_norm": 0.7973398566246033, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0412, + "step": 19470 + }, + { + "epoch": 0.5569692637598285, + "grad_norm": 0.43843239545822144, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0302, + "step": 19480 + }, + { + "epoch": 0.5572551822730522, + "grad_norm": 0.6797782182693481, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0401, + "step": 19490 + }, + { + "epoch": 0.557541100786276, + "grad_norm": 0.5020610690116882, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0329, + "step": 19500 + }, + { + "epoch": 0.5578270192994996, + "grad_norm": 0.5093050003051758, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0381, + "step": 19510 + }, + { + "epoch": 0.5581129378127234, + "grad_norm": 0.6136947870254517, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0373, + "step": 19520 + }, + { + "epoch": 0.5583988563259471, + "grad_norm": 0.4213317930698395, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0338, + "step": 19530 + }, + { + "epoch": 0.5586847748391708, + "grad_norm": 0.6560636162757874, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0332, + "step": 19540 + }, + { + "epoch": 0.5589706933523946, + "grad_norm": 0.41303765773773193, + "learning_rate": 7.177693135871202e-06, + "loss": 0.03, + "step": 19550 + }, + { + "epoch": 0.5592566118656183, + "grad_norm": 0.5260538458824158, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0328, + "step": 19560 + }, + { + "epoch": 0.559542530378842, + "grad_norm": 0.6076327562332153, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0428, + "step": 19570 + }, + { + "epoch": 0.5598284488920657, + "grad_norm": 0.635111927986145, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0318, + "step": 19580 + }, + { + "epoch": 0.5601143674052895, + "grad_norm": 0.7933056354522705, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0357, + "step": 19590 + }, + { + "epoch": 0.5604002859185132, + "grad_norm": 0.44312241673469543, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0289, + "step": 19600 + }, + { + "epoch": 0.560686204431737, + "grad_norm": 0.36346134543418884, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0354, + "step": 19610 + }, + { + "epoch": 0.5609721229449607, + "grad_norm": 0.49605289101600647, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0367, + "step": 19620 + }, + { + "epoch": 0.5612580414581844, + "grad_norm": 0.7115452289581299, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0354, + "step": 19630 + }, + { + "epoch": 0.5615439599714082, + "grad_norm": 0.650925874710083, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0353, + "step": 19640 + }, + { + "epoch": 0.5618298784846318, + "grad_norm": 0.5046663880348206, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0294, + "step": 19650 + }, + { + "epoch": 0.5621157969978556, + "grad_norm": 0.4441855549812317, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0326, + "step": 19660 + }, + { + "epoch": 0.5624017155110793, + "grad_norm": 0.3956650495529175, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0446, + "step": 19670 + }, + { + "epoch": 0.5626876340243031, + "grad_norm": 0.5384211540222168, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0331, + "step": 19680 + }, + { + "epoch": 0.5629735525375268, + "grad_norm": 0.6183366775512695, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0324, + "step": 19690 + }, + { + "epoch": 0.5632594710507506, + "grad_norm": 0.9116242527961731, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0341, + "step": 19700 + }, + { + "epoch": 0.5635453895639743, + "grad_norm": 0.8171015381813049, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0306, + "step": 19710 + }, + { + "epoch": 0.563831308077198, + "grad_norm": 0.42670243978500366, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0336, + "step": 19720 + }, + { + "epoch": 0.5641172265904217, + "grad_norm": 0.7338811159133911, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0363, + "step": 19730 + }, + { + "epoch": 0.5644031451036454, + "grad_norm": 0.5576338171958923, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0371, + "step": 19740 + }, + { + "epoch": 0.5646890636168692, + "grad_norm": 0.7390629649162292, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0382, + "step": 19750 + }, + { + "epoch": 0.5649749821300929, + "grad_norm": 0.801812469959259, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0379, + "step": 19760 + }, + { + "epoch": 0.5652609006433167, + "grad_norm": 0.5697385668754578, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0369, + "step": 19770 + }, + { + "epoch": 0.5655468191565404, + "grad_norm": 0.4180932343006134, + "learning_rate": 6.975884226362e-06, + "loss": 0.039, + "step": 19780 + }, + { + "epoch": 0.5658327376697642, + "grad_norm": 0.648389995098114, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0346, + "step": 19790 + }, + { + "epoch": 0.5661186561829878, + "grad_norm": 0.9673929214477539, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0392, + "step": 19800 + }, + { + "epoch": 0.5664045746962115, + "grad_norm": 0.4793975353240967, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0321, + "step": 19810 + }, + { + "epoch": 0.5666904932094353, + "grad_norm": 0.5206098556518555, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0319, + "step": 19820 + }, + { + "epoch": 0.566976411722659, + "grad_norm": 0.39929306507110596, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0335, + "step": 19830 + }, + { + "epoch": 0.5672623302358828, + "grad_norm": 0.6819440722465515, + "learning_rate": 6.923644220932124e-06, + "loss": 0.0338, + "step": 19840 + }, + { + "epoch": 0.5675482487491065, + "grad_norm": 0.7612042427062988, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0345, + "step": 19850 + }, + { + "epoch": 0.5678341672623303, + "grad_norm": 0.472676545381546, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0388, + "step": 19860 + }, + { + "epoch": 0.568120085775554, + "grad_norm": 0.48102107644081116, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0304, + "step": 19870 + }, + { + "epoch": 0.5684060042887777, + "grad_norm": 0.4174644649028778, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0315, + "step": 19880 + }, + { + "epoch": 0.5686919228020014, + "grad_norm": 0.4218151271343231, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0413, + "step": 19890 + }, + { + "epoch": 0.5689778413152251, + "grad_norm": 0.8243978023529053, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0399, + "step": 19900 + }, + { + "epoch": 0.5692637598284489, + "grad_norm": 0.400924414396286, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0298, + "step": 19910 + }, + { + "epoch": 0.5695496783416726, + "grad_norm": 0.5199277400970459, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0351, + "step": 19920 + }, + { + "epoch": 0.5698355968548964, + "grad_norm": 0.5238781571388245, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0374, + "step": 19930 + }, + { + "epoch": 0.5701215153681201, + "grad_norm": 0.7451756596565247, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0378, + "step": 19940 + }, + { + "epoch": 0.5704074338813439, + "grad_norm": 0.5029926300048828, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0391, + "step": 19950 + }, + { + "epoch": 0.5706933523945675, + "grad_norm": 0.5532147884368896, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0383, + "step": 19960 + }, + { + "epoch": 0.5709792709077913, + "grad_norm": 0.5694131851196289, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0364, + "step": 19970 + }, + { + "epoch": 0.571265189421015, + "grad_norm": 0.5066515803337097, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0363, + "step": 19980 + }, + { + "epoch": 0.5715511079342387, + "grad_norm": 0.5676470398902893, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0327, + "step": 19990 + }, + { + "epoch": 0.5718370264474625, + "grad_norm": 0.37414318323135376, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0395, + "step": 20000 + }, + { + "epoch": 0.5721229449606862, + "grad_norm": 0.5888793468475342, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0372, + "step": 20010 + }, + { + "epoch": 0.57240886347391, + "grad_norm": 0.6593262553215027, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0329, + "step": 20020 + }, + { + "epoch": 0.5726947819871336, + "grad_norm": 0.6382879614830017, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0286, + "step": 20030 + }, + { + "epoch": 0.5729807005003574, + "grad_norm": 0.6364927887916565, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0383, + "step": 20040 + }, + { + "epoch": 0.5732666190135811, + "grad_norm": 0.4102194011211395, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0342, + "step": 20050 + }, + { + "epoch": 0.5735525375268049, + "grad_norm": 0.6449235081672668, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0315, + "step": 20060 + }, + { + "epoch": 0.5738384560400286, + "grad_norm": 0.708431601524353, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0316, + "step": 20070 + }, + { + "epoch": 0.5741243745532523, + "grad_norm": 0.46444272994995117, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0352, + "step": 20080 + }, + { + "epoch": 0.5744102930664761, + "grad_norm": 0.7026715278625488, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0337, + "step": 20090 + }, + { + "epoch": 0.5746962115796997, + "grad_norm": 0.43397894501686096, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0303, + "step": 20100 + }, + { + "epoch": 0.5749821300929235, + "grad_norm": 0.4937734305858612, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0403, + "step": 20110 + }, + { + "epoch": 0.5752680486061472, + "grad_norm": 0.5981410145759583, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0375, + "step": 20120 + }, + { + "epoch": 0.575553967119371, + "grad_norm": 0.5616198778152466, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0314, + "step": 20130 + }, + { + "epoch": 0.5758398856325947, + "grad_norm": 0.35028502345085144, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0367, + "step": 20140 + }, + { + "epoch": 0.5761258041458185, + "grad_norm": 0.3556109666824341, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0308, + "step": 20150 + }, + { + "epoch": 0.5764117226590422, + "grad_norm": 0.579409658908844, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0344, + "step": 20160 + }, + { + "epoch": 0.5766976411722659, + "grad_norm": 0.4484683573246002, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0312, + "step": 20170 + }, + { + "epoch": 0.5769835596854896, + "grad_norm": 0.3636038899421692, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0337, + "step": 20180 + }, + { + "epoch": 0.5772694781987133, + "grad_norm": 0.6667287349700928, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0343, + "step": 20190 + }, + { + "epoch": 0.5775553967119371, + "grad_norm": 0.26031574606895447, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0303, + "step": 20200 + }, + { + "epoch": 0.5778413152251608, + "grad_norm": 0.6683355569839478, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0316, + "step": 20210 + }, + { + "epoch": 0.5781272337383846, + "grad_norm": 0.4097786843776703, + "learning_rate": 6.596880604028027e-06, + "loss": 0.0346, + "step": 20220 + }, + { + "epoch": 0.5784131522516083, + "grad_norm": 0.45405757427215576, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0345, + "step": 20230 + }, + { + "epoch": 0.5786990707648321, + "grad_norm": 0.28291839361190796, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0323, + "step": 20240 + }, + { + "epoch": 0.5789849892780558, + "grad_norm": 0.5656186938285828, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0299, + "step": 20250 + }, + { + "epoch": 0.5792709077912794, + "grad_norm": 0.6780310869216919, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0309, + "step": 20260 + }, + { + "epoch": 0.5795568263045032, + "grad_norm": 0.3968813121318817, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0347, + "step": 20270 + }, + { + "epoch": 0.5798427448177269, + "grad_norm": 0.6598440408706665, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0329, + "step": 20280 + }, + { + "epoch": 0.5801286633309507, + "grad_norm": 0.4988970458507538, + "learning_rate": 6.53748481975927e-06, + "loss": 0.038, + "step": 20290 + }, + { + "epoch": 0.5804145818441744, + "grad_norm": 0.8016706705093384, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0358, + "step": 20300 + }, + { + "epoch": 0.5807005003573982, + "grad_norm": 0.8367684483528137, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0354, + "step": 20310 + }, + { + "epoch": 0.5809864188706219, + "grad_norm": 0.5730129480361938, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0421, + "step": 20320 + }, + { + "epoch": 0.5812723373838456, + "grad_norm": 0.43631577491760254, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0361, + "step": 20330 + }, + { + "epoch": 0.5815582558970693, + "grad_norm": 0.7001264691352844, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0355, + "step": 20340 + }, + { + "epoch": 0.581844174410293, + "grad_norm": 0.4988951086997986, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0387, + "step": 20350 + }, + { + "epoch": 0.5821300929235168, + "grad_norm": 0.45731016993522644, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0398, + "step": 20360 + }, + { + "epoch": 0.5824160114367405, + "grad_norm": 0.38684406876564026, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0345, + "step": 20370 + }, + { + "epoch": 0.5827019299499643, + "grad_norm": 0.3924580514431, + "learning_rate": 6.461496350649529e-06, + "loss": 0.037, + "step": 20380 + }, + { + "epoch": 0.582987848463188, + "grad_norm": 0.43735265731811523, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0371, + "step": 20390 + }, + { + "epoch": 0.5832737669764118, + "grad_norm": 0.4595138430595398, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0337, + "step": 20400 + }, + { + "epoch": 0.5835596854896354, + "grad_norm": 0.429569810628891, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0284, + "step": 20410 + }, + { + "epoch": 0.5838456040028592, + "grad_norm": 0.5399166345596313, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0353, + "step": 20420 + }, + { + "epoch": 0.5841315225160829, + "grad_norm": 0.5698734521865845, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0361, + "step": 20430 + }, + { + "epoch": 0.5844174410293066, + "grad_norm": 0.35422587394714355, + "learning_rate": 6.411076603575166e-06, + "loss": 0.033, + "step": 20440 + }, + { + "epoch": 0.5847033595425304, + "grad_norm": 0.4475875198841095, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0344, + "step": 20450 + }, + { + "epoch": 0.5849892780557541, + "grad_norm": 0.4950159192085266, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0428, + "step": 20460 + }, + { + "epoch": 0.5852751965689779, + "grad_norm": 0.695249617099762, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0354, + "step": 20470 + }, + { + "epoch": 0.5855611150822015, + "grad_norm": 0.2538593113422394, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0383, + "step": 20480 + }, + { + "epoch": 0.5858470335954253, + "grad_norm": 0.6770910024642944, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0364, + "step": 20490 + }, + { + "epoch": 0.586132952108649, + "grad_norm": 0.7187057733535767, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0319, + "step": 20500 + }, + { + "epoch": 0.5864188706218728, + "grad_norm": 0.34853193163871765, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.033, + "step": 20510 + }, + { + "epoch": 0.5867047891350965, + "grad_norm": 0.8484768271446228, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0331, + "step": 20520 + }, + { + "epoch": 0.5869907076483202, + "grad_norm": 0.6645244359970093, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0353, + "step": 20530 + }, + { + "epoch": 0.587276626161544, + "grad_norm": 0.5094996690750122, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0374, + "step": 20540 + }, + { + "epoch": 0.5875625446747677, + "grad_norm": 0.5012859106063843, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0329, + "step": 20550 + }, + { + "epoch": 0.5878484631879914, + "grad_norm": 0.6465861797332764, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0282, + "step": 20560 + }, + { + "epoch": 0.5881343817012151, + "grad_norm": 0.5694834589958191, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0313, + "step": 20570 + }, + { + "epoch": 0.5884203002144389, + "grad_norm": 0.4945555627346039, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0353, + "step": 20580 + }, + { + "epoch": 0.5887062187276626, + "grad_norm": 0.5606586933135986, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0343, + "step": 20590 + }, + { + "epoch": 0.5889921372408864, + "grad_norm": 0.6913802027702332, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0358, + "step": 20600 + }, + { + "epoch": 0.5892780557541101, + "grad_norm": 0.8119901418685913, + "learning_rate": 6.269280523549298e-06, + "loss": 0.038, + "step": 20610 + }, + { + "epoch": 0.5895639742673338, + "grad_norm": 0.5558752417564392, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0311, + "step": 20620 + }, + { + "epoch": 0.5898498927805575, + "grad_norm": 0.45028987526893616, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0321, + "step": 20630 + }, + { + "epoch": 0.5901358112937812, + "grad_norm": 0.3697125017642975, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0331, + "step": 20640 + }, + { + "epoch": 0.590421729807005, + "grad_norm": 0.5406038761138916, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0445, + "step": 20650 + }, + { + "epoch": 0.5907076483202287, + "grad_norm": 0.4301048219203949, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0371, + "step": 20660 + }, + { + "epoch": 0.5909935668334525, + "grad_norm": 0.6343403458595276, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0353, + "step": 20670 + }, + { + "epoch": 0.5912794853466762, + "grad_norm": 0.4666310250759125, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0352, + "step": 20680 + }, + { + "epoch": 0.5915654038599, + "grad_norm": 0.7471063733100891, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0352, + "step": 20690 + }, + { + "epoch": 0.5918513223731237, + "grad_norm": 0.9971692562103271, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0327, + "step": 20700 + }, + { + "epoch": 0.5921372408863473, + "grad_norm": 0.5646237134933472, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0365, + "step": 20710 + }, + { + "epoch": 0.5924231593995711, + "grad_norm": 0.46781328320503235, + "learning_rate": 6.17838207381795e-06, + "loss": 0.042, + "step": 20720 + }, + { + "epoch": 0.5927090779127948, + "grad_norm": 0.7061547040939331, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0484, + "step": 20730 + }, + { + "epoch": 0.5929949964260186, + "grad_norm": 0.6651175618171692, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0353, + "step": 20740 + }, + { + "epoch": 0.5932809149392423, + "grad_norm": 0.5959596037864685, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0344, + "step": 20750 + }, + { + "epoch": 0.5935668334524661, + "grad_norm": 0.5869056582450867, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0389, + "step": 20760 + }, + { + "epoch": 0.5938527519656898, + "grad_norm": 0.42101356387138367, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0288, + "step": 20770 + }, + { + "epoch": 0.5941386704789136, + "grad_norm": 0.6310023069381714, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0362, + "step": 20780 + }, + { + "epoch": 0.5944245889921372, + "grad_norm": 0.6737013459205627, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0377, + "step": 20790 + }, + { + "epoch": 0.5947105075053609, + "grad_norm": 0.6716046333312988, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0415, + "step": 20800 + }, + { + "epoch": 0.5949964260185847, + "grad_norm": 0.9742669463157654, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0337, + "step": 20810 + }, + { + "epoch": 0.5952823445318084, + "grad_norm": 0.571782648563385, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0362, + "step": 20820 + }, + { + "epoch": 0.5955682630450322, + "grad_norm": 0.9673911333084106, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0362, + "step": 20830 + }, + { + "epoch": 0.5958541815582559, + "grad_norm": 0.5391695499420166, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0331, + "step": 20840 + }, + { + "epoch": 0.5961401000714797, + "grad_norm": 1.4766349792480469, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0332, + "step": 20850 + }, + { + "epoch": 0.5964260185847033, + "grad_norm": 0.6329004168510437, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0375, + "step": 20860 + }, + { + "epoch": 0.5967119370979271, + "grad_norm": 0.6745501160621643, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0347, + "step": 20870 + }, + { + "epoch": 0.5969978556111508, + "grad_norm": 0.3006536364555359, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0321, + "step": 20880 + }, + { + "epoch": 0.5972837741243745, + "grad_norm": 0.4666125476360321, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0363, + "step": 20890 + }, + { + "epoch": 0.5975696926375983, + "grad_norm": 0.3881456255912781, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0318, + "step": 20900 + }, + { + "epoch": 0.597855611150822, + "grad_norm": 0.4211449921131134, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0357, + "step": 20910 + }, + { + "epoch": 0.5981415296640458, + "grad_norm": 1.125683307647705, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0364, + "step": 20920 + }, + { + "epoch": 0.5984274481772694, + "grad_norm": 0.9670853614807129, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0385, + "step": 20930 + }, + { + "epoch": 0.5987133666904932, + "grad_norm": 0.7302138209342957, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0321, + "step": 20940 + }, + { + "epoch": 0.5989992852037169, + "grad_norm": 0.7883613109588623, + "learning_rate": 5.990549152010853e-06, + "loss": 0.038, + "step": 20950 + }, + { + "epoch": 0.5992852037169407, + "grad_norm": 0.44051188230514526, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0423, + "step": 20960 + }, + { + "epoch": 0.5995711222301644, + "grad_norm": 0.5225116014480591, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0293, + "step": 20970 + }, + { + "epoch": 0.5998570407433881, + "grad_norm": 0.44672495126724243, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0314, + "step": 20980 + }, + { + "epoch": 0.6001429592566119, + "grad_norm": 0.4489240050315857, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0298, + "step": 20990 + }, + { + "epoch": 0.6004288777698356, + "grad_norm": 0.3942757844924927, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0323, + "step": 21000 + }, + { + "epoch": 0.6007147962830593, + "grad_norm": 0.5079668760299683, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0435, + "step": 21010 + }, + { + "epoch": 0.601000714796283, + "grad_norm": 0.5057359933853149, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0364, + "step": 21020 + }, + { + "epoch": 0.6012866333095068, + "grad_norm": 0.4823545515537262, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0408, + "step": 21030 + }, + { + "epoch": 0.6015725518227305, + "grad_norm": 0.42647498846054077, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0366, + "step": 21040 + }, + { + "epoch": 0.6018584703359543, + "grad_norm": 0.5967830419540405, + "learning_rate": 5.909845843697164e-06, + "loss": 0.037, + "step": 21050 + }, + { + "epoch": 0.602144388849178, + "grad_norm": 0.4567292034626007, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0306, + "step": 21060 + }, + { + "epoch": 0.6024303073624017, + "grad_norm": 0.6767273545265198, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0447, + "step": 21070 + }, + { + "epoch": 0.6027162258756255, + "grad_norm": 0.2957002520561218, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0339, + "step": 21080 + }, + { + "epoch": 0.6030021443888491, + "grad_norm": 0.6870969533920288, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0313, + "step": 21090 + }, + { + "epoch": 0.6032880629020729, + "grad_norm": 0.530910313129425, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0377, + "step": 21100 + }, + { + "epoch": 0.6035739814152966, + "grad_norm": 0.21370625495910645, + "learning_rate": 5.86170998451151e-06, + "loss": 0.032, + "step": 21110 + }, + { + "epoch": 0.6038598999285204, + "grad_norm": 0.6039503812789917, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0258, + "step": 21120 + }, + { + "epoch": 0.6041458184417441, + "grad_norm": 0.5375682711601257, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0341, + "step": 21130 + }, + { + "epoch": 0.6044317369549679, + "grad_norm": 0.4819096326828003, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0309, + "step": 21140 + }, + { + "epoch": 0.6047176554681916, + "grad_norm": 0.31165415048599243, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0278, + "step": 21150 + }, + { + "epoch": 0.6050035739814152, + "grad_norm": 0.2781001925468445, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0342, + "step": 21160 + }, + { + "epoch": 0.605289492494639, + "grad_norm": 0.44726037979125977, + "learning_rate": 5.813791207086085e-06, + "loss": 0.032, + "step": 21170 + }, + { + "epoch": 0.6055754110078627, + "grad_norm": 0.5762766599655151, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0325, + "step": 21180 + }, + { + "epoch": 0.6058613295210865, + "grad_norm": 0.49829939007759094, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0322, + "step": 21190 + }, + { + "epoch": 0.6061472480343102, + "grad_norm": 0.4683297276496887, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0369, + "step": 21200 + }, + { + "epoch": 0.606433166547534, + "grad_norm": 0.662159264087677, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0278, + "step": 21210 + }, + { + "epoch": 0.6067190850607577, + "grad_norm": 0.4397001564502716, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0366, + "step": 21220 + }, + { + "epoch": 0.6070050035739815, + "grad_norm": 0.4977007508277893, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0293, + "step": 21230 + }, + { + "epoch": 0.6072909220872051, + "grad_norm": 0.3705490827560425, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0315, + "step": 21240 + }, + { + "epoch": 0.6075768406004288, + "grad_norm": 0.6350240111351013, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0286, + "step": 21250 + }, + { + "epoch": 0.6078627591136526, + "grad_norm": 0.5590423941612244, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0373, + "step": 21260 + }, + { + "epoch": 0.6081486776268763, + "grad_norm": 0.5244049429893494, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0325, + "step": 21270 + }, + { + "epoch": 0.6084345961401001, + "grad_norm": 1.082044005393982, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0373, + "step": 21280 + }, + { + "epoch": 0.6087205146533238, + "grad_norm": 0.614028811454773, + "learning_rate": 5.71861298612245e-06, + "loss": 0.031, + "step": 21290 + }, + { + "epoch": 0.6090064331665476, + "grad_norm": 0.783205509185791, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0289, + "step": 21300 + }, + { + "epoch": 0.6092923516797712, + "grad_norm": 0.5420807600021362, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.031, + "step": 21310 + }, + { + "epoch": 0.609578270192995, + "grad_norm": 0.42979222536087036, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0291, + "step": 21320 + }, + { + "epoch": 0.6098641887062187, + "grad_norm": 0.44511356949806213, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.031, + "step": 21330 + }, + { + "epoch": 0.6101501072194424, + "grad_norm": 0.528799831867218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0269, + "step": 21340 + }, + { + "epoch": 0.6104360257326662, + "grad_norm": 0.43274471163749695, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0438, + "step": 21350 + }, + { + "epoch": 0.6107219442458899, + "grad_norm": 0.8020172715187073, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0393, + "step": 21360 + }, + { + "epoch": 0.6110078627591137, + "grad_norm": 0.4354296028614044, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0338, + "step": 21370 + }, + { + "epoch": 0.6112937812723374, + "grad_norm": 0.587364673614502, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0359, + "step": 21380 + }, + { + "epoch": 0.6115796997855611, + "grad_norm": 0.5426310300827026, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0333, + "step": 21390 + }, + { + "epoch": 0.6118656182987848, + "grad_norm": 0.5900459289550781, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0344, + "step": 21400 + }, + { + "epoch": 0.6121515368120086, + "grad_norm": 0.5652357935905457, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0396, + "step": 21410 + }, + { + "epoch": 0.6124374553252323, + "grad_norm": 0.5287114977836609, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0387, + "step": 21420 + }, + { + "epoch": 0.612723373838456, + "grad_norm": 0.7939184904098511, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0351, + "step": 21430 + }, + { + "epoch": 0.6130092923516798, + "grad_norm": 0.6840642094612122, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0349, + "step": 21440 + }, + { + "epoch": 0.6132952108649035, + "grad_norm": 0.3717428147792816, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0336, + "step": 21450 + }, + { + "epoch": 0.6135811293781273, + "grad_norm": 0.5073713064193726, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0326, + "step": 21460 + }, + { + "epoch": 0.6138670478913509, + "grad_norm": 1.1579232215881348, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0388, + "step": 21470 + }, + { + "epoch": 0.6141529664045747, + "grad_norm": 0.4209369122982025, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0307, + "step": 21480 + }, + { + "epoch": 0.6144388849177984, + "grad_norm": 0.38663822412490845, + "learning_rate": 5.561973825289734e-06, + "loss": 0.037, + "step": 21490 + }, + { + "epoch": 0.6147248034310222, + "grad_norm": 0.538270890712738, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0333, + "step": 21500 + }, + { + "epoch": 0.6150107219442459, + "grad_norm": 0.28280535340309143, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0319, + "step": 21510 + }, + { + "epoch": 0.6152966404574696, + "grad_norm": 0.5407803058624268, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0352, + "step": 21520 + }, + { + "epoch": 0.6155825589706934, + "grad_norm": 1.4600974321365356, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0409, + "step": 21530 + }, + { + "epoch": 0.615868477483917, + "grad_norm": 0.659900426864624, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0322, + "step": 21540 + }, + { + "epoch": 0.6161543959971408, + "grad_norm": 0.6401934623718262, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0339, + "step": 21550 + }, + { + "epoch": 0.6164403145103645, + "grad_norm": 0.6409866213798523, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0331, + "step": 21560 + }, + { + "epoch": 0.6167262330235883, + "grad_norm": 0.6627630591392517, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0332, + "step": 21570 + }, + { + "epoch": 0.617012151536812, + "grad_norm": 0.6180721521377563, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0327, + "step": 21580 + }, + { + "epoch": 0.6172980700500358, + "grad_norm": 0.4689866006374359, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0276, + "step": 21590 + }, + { + "epoch": 0.6175839885632595, + "grad_norm": 0.5039265751838684, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0427, + "step": 21600 + }, + { + "epoch": 0.6178699070764831, + "grad_norm": 0.5313833355903625, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0351, + "step": 21610 + }, + { + "epoch": 0.6181558255897069, + "grad_norm": 0.4919044077396393, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0327, + "step": 21620 + }, + { + "epoch": 0.6184417441029306, + "grad_norm": 0.5446444153785706, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0331, + "step": 21630 + }, + { + "epoch": 0.6187276626161544, + "grad_norm": 0.5198109745979309, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.032, + "step": 21640 + }, + { + "epoch": 0.6190135811293781, + "grad_norm": 0.5684625506401062, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0339, + "step": 21650 + }, + { + "epoch": 0.6192994996426019, + "grad_norm": 0.6882810592651367, + "learning_rate": 5.430834687545416e-06, + "loss": 0.035, + "step": 21660 + }, + { + "epoch": 0.6195854181558256, + "grad_norm": 0.7360101938247681, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0306, + "step": 21670 + }, + { + "epoch": 0.6198713366690494, + "grad_norm": 0.5557180047035217, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0241, + "step": 21680 + }, + { + "epoch": 0.620157255182273, + "grad_norm": 0.4302096962928772, + "learning_rate": 5.407887295494495e-06, + "loss": 0.035, + "step": 21690 + }, + { + "epoch": 0.6204431736954967, + "grad_norm": 0.4740016460418701, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0331, + "step": 21700 + }, + { + "epoch": 0.6207290922087205, + "grad_norm": 0.5400598049163818, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0297, + "step": 21710 + }, + { + "epoch": 0.6210150107219442, + "grad_norm": 0.4270641803741455, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0334, + "step": 21720 + }, + { + "epoch": 0.621300929235168, + "grad_norm": 0.41063550114631653, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0375, + "step": 21730 + }, + { + "epoch": 0.6215868477483917, + "grad_norm": 0.48556044697761536, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0291, + "step": 21740 + }, + { + "epoch": 0.6218727662616155, + "grad_norm": 0.2872731387615204, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0323, + "step": 21750 + }, + { + "epoch": 0.6221586847748392, + "grad_norm": 0.4088454246520996, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0307, + "step": 21760 + }, + { + "epoch": 0.622444603288063, + "grad_norm": 0.42600440979003906, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0326, + "step": 21770 + }, + { + "epoch": 0.6227305218012866, + "grad_norm": 0.36466315388679504, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0337, + "step": 21780 + }, + { + "epoch": 0.6230164403145103, + "grad_norm": 0.588921308517456, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0336, + "step": 21790 + }, + { + "epoch": 0.6233023588277341, + "grad_norm": 0.44768571853637695, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0326, + "step": 21800 + }, + { + "epoch": 0.6235882773409578, + "grad_norm": 1.1612637042999268, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0355, + "step": 21810 + }, + { + "epoch": 0.6238741958541816, + "grad_norm": 1.0912114381790161, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0436, + "step": 21820 + }, + { + "epoch": 0.6241601143674053, + "grad_norm": 0.5813164710998535, + "learning_rate": 5.301584321328435e-06, + "loss": 0.034, + "step": 21830 + }, + { + "epoch": 0.624446032880629, + "grad_norm": 0.45064911246299744, + "learning_rate": 5.294041118587667e-06, + "loss": 0.032, + "step": 21840 + }, + { + "epoch": 0.6247319513938527, + "grad_norm": 0.5173943638801575, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0322, + "step": 21850 + }, + { + "epoch": 0.6250178699070765, + "grad_norm": 0.41157352924346924, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0319, + "step": 21860 + }, + { + "epoch": 0.6253037884203002, + "grad_norm": 0.5711286067962646, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0288, + "step": 21870 + }, + { + "epoch": 0.6255897069335239, + "grad_norm": 0.5108116865158081, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0325, + "step": 21880 + }, + { + "epoch": 0.6258756254467477, + "grad_norm": 0.49562424421310425, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0306, + "step": 21890 + }, + { + "epoch": 0.6261615439599714, + "grad_norm": 0.3392108976840973, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0358, + "step": 21900 + }, + { + "epoch": 0.6264474624731952, + "grad_norm": 1.0588114261627197, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0404, + "step": 21910 + }, + { + "epoch": 0.6267333809864188, + "grad_norm": 0.6979959607124329, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0349, + "step": 21920 + }, + { + "epoch": 0.6270192994996426, + "grad_norm": 0.3185918927192688, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0276, + "step": 21930 + }, + { + "epoch": 0.6273052180128663, + "grad_norm": 0.3921501338481903, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0326, + "step": 21940 + }, + { + "epoch": 0.6275911365260901, + "grad_norm": 0.9666212797164917, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0346, + "step": 21950 + }, + { + "epoch": 0.6278770550393138, + "grad_norm": 0.4483211040496826, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0306, + "step": 21960 + }, + { + "epoch": 0.6281629735525375, + "grad_norm": 0.4839077293872833, + "learning_rate": 5.196592054173714e-06, + "loss": 0.026, + "step": 21970 + }, + { + "epoch": 0.6284488920657613, + "grad_norm": 0.5054528117179871, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0299, + "step": 21980 + }, + { + "epoch": 0.628734810578985, + "grad_norm": 0.5953076481819153, + "learning_rate": 5.181701567303612e-06, + "loss": 0.036, + "step": 21990 + }, + { + "epoch": 0.6290207290922087, + "grad_norm": 0.39300060272216797, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0358, + "step": 22000 + }, + { + "epoch": 0.6293066476054324, + "grad_norm": 0.42864665389060974, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0363, + "step": 22010 + }, + { + "epoch": 0.6295925661186562, + "grad_norm": 0.33609238266944885, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0398, + "step": 22020 + }, + { + "epoch": 0.6298784846318799, + "grad_norm": 0.4237107038497925, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0319, + "step": 22030 + }, + { + "epoch": 0.6301644031451037, + "grad_norm": 0.42774054408073425, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0332, + "step": 22040 + }, + { + "epoch": 0.6304503216583274, + "grad_norm": 0.8992825150489807, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0396, + "step": 22050 + }, + { + "epoch": 0.630736240171551, + "grad_norm": 0.20832861959934235, + "learning_rate": 5.129800405815733e-06, + "loss": 0.03, + "step": 22060 + }, + { + "epoch": 0.6310221586847748, + "grad_norm": 0.5961321592330933, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0429, + "step": 22070 + }, + { + "epoch": 0.6313080771979985, + "grad_norm": 0.5037736296653748, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0332, + "step": 22080 + }, + { + "epoch": 0.6315939957112223, + "grad_norm": 0.383732408285141, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0293, + "step": 22090 + }, + { + "epoch": 0.631879914224446, + "grad_norm": 0.8124368786811829, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0286, + "step": 22100 + }, + { + "epoch": 0.6321658327376698, + "grad_norm": 0.96833735704422, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0346, + "step": 22110 + }, + { + "epoch": 0.6324517512508935, + "grad_norm": 0.42382001876831055, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0345, + "step": 22120 + }, + { + "epoch": 0.6327376697641173, + "grad_norm": 0.5928776860237122, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0314, + "step": 22130 + }, + { + "epoch": 0.633023588277341, + "grad_norm": 0.7822670340538025, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0335, + "step": 22140 + }, + { + "epoch": 0.6333095067905646, + "grad_norm": 0.6383520364761353, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0321, + "step": 22150 + }, + { + "epoch": 0.6335954253037884, + "grad_norm": 0.3413240611553192, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0314, + "step": 22160 + }, + { + "epoch": 0.6338813438170121, + "grad_norm": 0.5960783958435059, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0385, + "step": 22170 + }, + { + "epoch": 0.6341672623302359, + "grad_norm": 0.2557702660560608, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0316, + "step": 22180 + }, + { + "epoch": 0.6344531808434596, + "grad_norm": 0.6229982376098633, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0325, + "step": 22190 + }, + { + "epoch": 0.6347390993566834, + "grad_norm": 0.5080077052116394, + "learning_rate": 5.027013727107874e-06, + "loss": 0.036, + "step": 22200 + }, + { + "epoch": 0.6350250178699071, + "grad_norm": 0.5630851984024048, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0279, + "step": 22210 + }, + { + "epoch": 0.6353109363831309, + "grad_norm": 0.81584233045578, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0343, + "step": 22220 + }, + { + "epoch": 0.6355968548963545, + "grad_norm": 0.3985321521759033, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0289, + "step": 22230 + }, + { + "epoch": 0.6358827734095782, + "grad_norm": 0.4481184482574463, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0345, + "step": 22240 + }, + { + "epoch": 0.636168691922802, + "grad_norm": 0.3640075623989105, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0307, + "step": 22250 + }, + { + "epoch": 0.6364546104360257, + "grad_norm": 0.4006771147251129, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0381, + "step": 22260 + }, + { + "epoch": 0.6367405289492495, + "grad_norm": 0.7638134360313416, + "learning_rate": 4.976134120528886e-06, + "loss": 0.039, + "step": 22270 + }, + { + "epoch": 0.6370264474624732, + "grad_norm": 0.4820837080478668, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0281, + "step": 22280 + }, + { + "epoch": 0.637312365975697, + "grad_norm": 0.5928444266319275, + "learning_rate": 4.961660586405147e-06, + "loss": 0.033, + "step": 22290 + }, + { + "epoch": 0.6375982844889206, + "grad_norm": 0.50687575340271, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0357, + "step": 22300 + }, + { + "epoch": 0.6378842030021444, + "grad_norm": 0.673939049243927, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0301, + "step": 22310 + }, + { + "epoch": 0.6381701215153681, + "grad_norm": 0.4300031065940857, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.029, + "step": 22320 + }, + { + "epoch": 0.6384560400285918, + "grad_norm": 0.6585102677345276, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0333, + "step": 22330 + }, + { + "epoch": 0.6387419585418156, + "grad_norm": 0.6430448889732361, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0345, + "step": 22340 + }, + { + "epoch": 0.6390278770550393, + "grad_norm": 0.8272712826728821, + "learning_rate": 4.918410326949594e-06, + "loss": 0.034, + "step": 22350 + }, + { + "epoch": 0.6393137955682631, + "grad_norm": 0.7631726861000061, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0321, + "step": 22360 + }, + { + "epoch": 0.6395997140814867, + "grad_norm": 0.5562252402305603, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0339, + "step": 22370 + }, + { + "epoch": 0.6398856325947105, + "grad_norm": 0.6027814149856567, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0355, + "step": 22380 + }, + { + "epoch": 0.6401715511079342, + "grad_norm": 0.3548984229564667, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0343, + "step": 22390 + }, + { + "epoch": 0.640457469621158, + "grad_norm": 0.4959709346294403, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.031, + "step": 22400 + }, + { + "epoch": 0.6407433881343817, + "grad_norm": 0.3765028715133667, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.0406, + "step": 22410 + }, + { + "epoch": 0.6410293066476054, + "grad_norm": 0.5014662146568298, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0297, + "step": 22420 + }, + { + "epoch": 0.6413152251608292, + "grad_norm": 0.5085675716400146, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0324, + "step": 22430 + }, + { + "epoch": 0.6416011436740529, + "grad_norm": 0.37595826387405396, + "learning_rate": 4.854017257346105e-06, + "loss": 0.033, + "step": 22440 + }, + { + "epoch": 0.6418870621872766, + "grad_norm": 0.5408678650856018, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0323, + "step": 22450 + }, + { + "epoch": 0.6421729807005003, + "grad_norm": 0.4319652020931244, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0269, + "step": 22460 + }, + { + "epoch": 0.6424588992137241, + "grad_norm": 0.41388124227523804, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0311, + "step": 22470 + }, + { + "epoch": 0.6427448177269478, + "grad_norm": 0.4778555631637573, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0332, + "step": 22480 + }, + { + "epoch": 0.6430307362401716, + "grad_norm": 0.38835474848747253, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0304, + "step": 22490 + }, + { + "epoch": 0.6433166547533953, + "grad_norm": 0.5165611505508423, + "learning_rate": 4.81141273556404e-06, + "loss": 0.0344, + "step": 22500 + }, + { + "epoch": 0.643602573266619, + "grad_norm": 0.4285198450088501, + "learning_rate": 4.804337352679613e-06, + "loss": 0.035, + "step": 22510 + }, + { + "epoch": 0.6438884917798428, + "grad_norm": 0.4512922167778015, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0354, + "step": 22520 + }, + { + "epoch": 0.6441744102930664, + "grad_norm": 0.33437663316726685, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0343, + "step": 22530 + }, + { + "epoch": 0.6444603288062902, + "grad_norm": 0.45291104912757874, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0307, + "step": 22540 + }, + { + "epoch": 0.6447462473195139, + "grad_norm": 0.5920093655586243, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0338, + "step": 22550 + }, + { + "epoch": 0.6450321658327377, + "grad_norm": 0.6362392902374268, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0335, + "step": 22560 + }, + { + "epoch": 0.6453180843459614, + "grad_norm": 0.28033652901649475, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0267, + "step": 22570 + }, + { + "epoch": 0.6456040028591852, + "grad_norm": 0.4563148617744446, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0316, + "step": 22580 + }, + { + "epoch": 0.6458899213724089, + "grad_norm": 0.4889507591724396, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.034, + "step": 22590 + }, + { + "epoch": 0.6461758398856325, + "grad_norm": 0.6826061010360718, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0316, + "step": 22600 + }, + { + "epoch": 0.6464617583988563, + "grad_norm": 0.45066431164741516, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0287, + "step": 22610 + }, + { + "epoch": 0.64674767691208, + "grad_norm": 0.41994187235832214, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0312, + "step": 22620 + }, + { + "epoch": 0.6470335954253038, + "grad_norm": 0.39731675386428833, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0347, + "step": 22630 + }, + { + "epoch": 0.6473195139385275, + "grad_norm": 0.5207498073577881, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0304, + "step": 22640 + }, + { + "epoch": 0.6476054324517513, + "grad_norm": 0.42930668592453003, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0344, + "step": 22650 + }, + { + "epoch": 0.647891350964975, + "grad_norm": 0.3023674488067627, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0359, + "step": 22660 + }, + { + "epoch": 0.6481772694781988, + "grad_norm": 0.43205010890960693, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0323, + "step": 22670 + }, + { + "epoch": 0.6484631879914224, + "grad_norm": 0.5984707474708557, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0318, + "step": 22680 + }, + { + "epoch": 0.6487491065046461, + "grad_norm": 0.43477800488471985, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0346, + "step": 22690 + }, + { + "epoch": 0.6490350250178699, + "grad_norm": 0.3570900857448578, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0342, + "step": 22700 + }, + { + "epoch": 0.6493209435310936, + "grad_norm": 0.47367945313453674, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0367, + "step": 22710 + }, + { + "epoch": 0.6496068620443174, + "grad_norm": 0.3768099844455719, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0357, + "step": 22720 + }, + { + "epoch": 0.6498927805575411, + "grad_norm": 0.6188724040985107, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0299, + "step": 22730 + }, + { + "epoch": 0.6501786990707649, + "grad_norm": 0.5733038783073425, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0343, + "step": 22740 + }, + { + "epoch": 0.6504646175839885, + "grad_norm": 0.5000156164169312, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0284, + "step": 22750 + }, + { + "epoch": 0.6507505360972123, + "grad_norm": 0.22813546657562256, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0288, + "step": 22760 + }, + { + "epoch": 0.651036454610436, + "grad_norm": 0.4805088937282562, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0305, + "step": 22770 + }, + { + "epoch": 0.6513223731236597, + "grad_norm": 0.4652612507343292, + "learning_rate": 4.616077433849538e-06, + "loss": 0.0304, + "step": 22780 + }, + { + "epoch": 0.6516082916368835, + "grad_norm": 0.5010579824447632, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0337, + "step": 22790 + }, + { + "epoch": 0.6518942101501072, + "grad_norm": 0.36260518431663513, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0284, + "step": 22800 + }, + { + "epoch": 0.652180128663331, + "grad_norm": 0.45098820328712463, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0297, + "step": 22810 + }, + { + "epoch": 0.6524660471765547, + "grad_norm": 0.6154504418373108, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0366, + "step": 22820 + }, + { + "epoch": 0.6527519656897784, + "grad_norm": 0.4522152543067932, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.033, + "step": 22830 + }, + { + "epoch": 0.6530378842030021, + "grad_norm": 0.34195253252983093, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0304, + "step": 22840 + }, + { + "epoch": 0.6533238027162259, + "grad_norm": 0.49787941575050354, + "learning_rate": 4.568154392147005e-06, + "loss": 0.033, + "step": 22850 + }, + { + "epoch": 0.6536097212294496, + "grad_norm": 0.5249335765838623, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0381, + "step": 22860 + }, + { + "epoch": 0.6538956397426733, + "grad_norm": 0.7645581960678101, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0298, + "step": 22870 + }, + { + "epoch": 0.6541815582558971, + "grad_norm": 0.6034232974052429, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0313, + "step": 22880 + }, + { + "epoch": 0.6544674767691208, + "grad_norm": 0.3499184846878052, + "learning_rate": 4.54093567906903e-06, + "loss": 0.036, + "step": 22890 + }, + { + "epoch": 0.6547533952823446, + "grad_norm": 0.4157135486602783, + "learning_rate": 4.534149931036931e-06, + "loss": 0.033, + "step": 22900 + }, + { + "epoch": 0.6550393137955682, + "grad_norm": 0.4563712775707245, + "learning_rate": 4.527371771040039e-06, + "loss": 0.0361, + "step": 22910 + }, + { + "epoch": 0.655325232308792, + "grad_norm": 1.080802321434021, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0307, + "step": 22920 + }, + { + "epoch": 0.6556111508220157, + "grad_norm": 0.38259357213974, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0292, + "step": 22930 + }, + { + "epoch": 0.6558970693352395, + "grad_norm": 0.6920587420463562, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0322, + "step": 22940 + }, + { + "epoch": 0.6561829878484632, + "grad_norm": 0.628978967666626, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0391, + "step": 22950 + }, + { + "epoch": 0.6564689063616869, + "grad_norm": 0.4848436713218689, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0306, + "step": 22960 + }, + { + "epoch": 0.6567548248749107, + "grad_norm": 0.4478876292705536, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0334, + "step": 22970 + }, + { + "epoch": 0.6570407433881343, + "grad_norm": 0.47360673546791077, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0357, + "step": 22980 + }, + { + "epoch": 0.6573266619013581, + "grad_norm": 0.32840496301651, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0339, + "step": 22990 + }, + { + "epoch": 0.6576125804145818, + "grad_norm": 0.4047236442565918, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0321, + "step": 23000 + }, + { + "epoch": 0.6578984989278056, + "grad_norm": 0.7817053198814392, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0375, + "step": 23010 + }, + { + "epoch": 0.6581844174410293, + "grad_norm": 0.38985809683799744, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0343, + "step": 23020 + }, + { + "epoch": 0.6584703359542531, + "grad_norm": 0.45360830426216125, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0287, + "step": 23030 + }, + { + "epoch": 0.6587562544674768, + "grad_norm": 0.2886345088481903, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0322, + "step": 23040 + }, + { + "epoch": 0.6590421729807004, + "grad_norm": 0.8546258211135864, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0331, + "step": 23050 + }, + { + "epoch": 0.6593280914939242, + "grad_norm": 0.48426172137260437, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0343, + "step": 23060 + }, + { + "epoch": 0.6596140100071479, + "grad_norm": 0.46379074454307556, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0296, + "step": 23070 + }, + { + "epoch": 0.6598999285203717, + "grad_norm": 0.7772185206413269, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0319, + "step": 23080 + }, + { + "epoch": 0.6601858470335954, + "grad_norm": 0.4606277644634247, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0336, + "step": 23090 + }, + { + "epoch": 0.6604717655468192, + "grad_norm": 0.43342530727386475, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0287, + "step": 23100 + }, + { + "epoch": 0.6607576840600429, + "grad_norm": 0.385151207447052, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0363, + "step": 23110 + }, + { + "epoch": 0.6610436025732667, + "grad_norm": 0.3960207998752594, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0306, + "step": 23120 + }, + { + "epoch": 0.6613295210864903, + "grad_norm": 0.41210439801216125, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0348, + "step": 23130 + }, + { + "epoch": 0.661615439599714, + "grad_norm": 0.41976168751716614, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0272, + "step": 23140 + }, + { + "epoch": 0.6619013581129378, + "grad_norm": 0.3195948004722595, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0362, + "step": 23150 + }, + { + "epoch": 0.6621872766261615, + "grad_norm": 0.7024016380310059, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0316, + "step": 23160 + }, + { + "epoch": 0.6624731951393853, + "grad_norm": 0.2894183099269867, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0339, + "step": 23170 + }, + { + "epoch": 0.662759113652609, + "grad_norm": 0.489715576171875, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0272, + "step": 23180 + }, + { + "epoch": 0.6630450321658328, + "grad_norm": 0.3406641185283661, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0272, + "step": 23190 + }, + { + "epoch": 0.6633309506790565, + "grad_norm": 0.3647848963737488, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0337, + "step": 23200 + }, + { + "epoch": 0.6636168691922802, + "grad_norm": 0.7023333311080933, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0334, + "step": 23210 + }, + { + "epoch": 0.6639027877055039, + "grad_norm": 0.43989211320877075, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0313, + "step": 23220 + }, + { + "epoch": 0.6641887062187276, + "grad_norm": 0.7329099774360657, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0283, + "step": 23230 + }, + { + "epoch": 0.6644746247319514, + "grad_norm": 0.3954019546508789, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0321, + "step": 23240 + }, + { + "epoch": 0.6647605432451751, + "grad_norm": 0.38020703196525574, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0337, + "step": 23250 + }, + { + "epoch": 0.6650464617583989, + "grad_norm": 0.5988985300064087, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0353, + "step": 23260 + }, + { + "epoch": 0.6653323802716226, + "grad_norm": 0.4259869158267975, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0316, + "step": 23270 + }, + { + "epoch": 0.6656182987848464, + "grad_norm": 0.4322545528411865, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0318, + "step": 23280 + }, + { + "epoch": 0.66590421729807, + "grad_norm": 0.40275540947914124, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0344, + "step": 23290 + }, + { + "epoch": 0.6661901358112938, + "grad_norm": 0.5070827603340149, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0336, + "step": 23300 + }, + { + "epoch": 0.6664760543245175, + "grad_norm": 0.614973247051239, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0352, + "step": 23310 + }, + { + "epoch": 0.6667619728377412, + "grad_norm": 0.4637722074985504, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0277, + "step": 23320 + }, + { + "epoch": 0.667047891350965, + "grad_norm": 0.34951677918434143, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0284, + "step": 23330 + }, + { + "epoch": 0.6673338098641887, + "grad_norm": 0.5609407424926758, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0304, + "step": 23340 + }, + { + "epoch": 0.6676197283774125, + "grad_norm": 0.44585973024368286, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0263, + "step": 23350 + }, + { + "epoch": 0.6679056468906361, + "grad_norm": 0.5311269760131836, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0311, + "step": 23360 + }, + { + "epoch": 0.6681915654038599, + "grad_norm": 0.4923100471496582, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0277, + "step": 23370 + }, + { + "epoch": 0.6684774839170836, + "grad_norm": 0.5254819989204407, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0328, + "step": 23380 + }, + { + "epoch": 0.6687634024303074, + "grad_norm": 0.47537869215011597, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0373, + "step": 23390 + }, + { + "epoch": 0.6690493209435311, + "grad_norm": 0.40087464451789856, + "learning_rate": 4.204700678381975e-06, + "loss": 0.034, + "step": 23400 + }, + { + "epoch": 0.6693352394567548, + "grad_norm": 0.5166190266609192, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0314, + "step": 23410 + }, + { + "epoch": 0.6696211579699786, + "grad_norm": 0.42874693870544434, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0279, + "step": 23420 + }, + { + "epoch": 0.6699070764832022, + "grad_norm": 0.3685651123523712, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0313, + "step": 23430 + }, + { + "epoch": 0.670192994996426, + "grad_norm": 0.5417486429214478, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.033, + "step": 23440 + }, + { + "epoch": 0.6704789135096497, + "grad_norm": 0.5764726996421814, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0368, + "step": 23450 + }, + { + "epoch": 0.6707648320228735, + "grad_norm": 0.44168850779533386, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0258, + "step": 23460 + }, + { + "epoch": 0.6710507505360972, + "grad_norm": 0.39990919828414917, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0403, + "step": 23470 + }, + { + "epoch": 0.671336669049321, + "grad_norm": 0.7526253461837769, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0334, + "step": 23480 + }, + { + "epoch": 0.6716225875625447, + "grad_norm": 0.4888451397418976, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0314, + "step": 23490 + }, + { + "epoch": 0.6719085060757684, + "grad_norm": 0.5732892751693726, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0277, + "step": 23500 + }, + { + "epoch": 0.6721944245889921, + "grad_norm": 0.5806633830070496, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0347, + "step": 23510 + }, + { + "epoch": 0.6724803431022158, + "grad_norm": 0.4336501657962799, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0369, + "step": 23520 + }, + { + "epoch": 0.6727662616154396, + "grad_norm": 0.47082582116127014, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0408, + "step": 23530 + }, + { + "epoch": 0.6730521801286633, + "grad_norm": 0.6571422815322876, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0316, + "step": 23540 + }, + { + "epoch": 0.6733380986418871, + "grad_norm": 0.4899539649486542, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0256, + "step": 23550 + }, + { + "epoch": 0.6736240171551108, + "grad_norm": 0.3201868236064911, + "learning_rate": 4.103441847743051e-06, + "loss": 0.029, + "step": 23560 + }, + { + "epoch": 0.6739099356683346, + "grad_norm": 0.4385588765144348, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0284, + "step": 23570 + }, + { + "epoch": 0.6741958541815583, + "grad_norm": 0.5079174637794495, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0298, + "step": 23580 + }, + { + "epoch": 0.6744817726947819, + "grad_norm": 0.609523355960846, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0313, + "step": 23590 + }, + { + "epoch": 0.6747676912080057, + "grad_norm": 0.487690269947052, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0246, + "step": 23600 + }, + { + "epoch": 0.6750536097212294, + "grad_norm": 0.5146880745887756, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0319, + "step": 23610 + }, + { + "epoch": 0.6753395282344532, + "grad_norm": 0.5848239064216614, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0282, + "step": 23620 + }, + { + "epoch": 0.6756254467476769, + "grad_norm": 0.7779616117477417, + "learning_rate": 4.05979084812184e-06, + "loss": 0.033, + "step": 23630 + }, + { + "epoch": 0.6759113652609007, + "grad_norm": 0.3329331576824188, + "learning_rate": 4.053587511509546e-06, + "loss": 0.028, + "step": 23640 + }, + { + "epoch": 0.6761972837741244, + "grad_norm": 0.4691336154937744, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0313, + "step": 23650 + }, + { + "epoch": 0.6764832022873482, + "grad_norm": 0.47258421778678894, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0326, + "step": 23660 + }, + { + "epoch": 0.6767691208005718, + "grad_norm": 0.5333718657493591, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0332, + "step": 23670 + }, + { + "epoch": 0.6770550393137955, + "grad_norm": 0.7278451323509216, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0409, + "step": 23680 + }, + { + "epoch": 0.6773409578270193, + "grad_norm": 0.41567277908325195, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0263, + "step": 23690 + }, + { + "epoch": 0.677626876340243, + "grad_norm": 0.4351106584072113, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0276, + "step": 23700 + }, + { + "epoch": 0.6779127948534668, + "grad_norm": 0.31096217036247253, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0339, + "step": 23710 + }, + { + "epoch": 0.6781987133666905, + "grad_norm": 0.6321837306022644, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0313, + "step": 23720 + }, + { + "epoch": 0.6784846318799143, + "grad_norm": 0.5278098583221436, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0294, + "step": 23730 + }, + { + "epoch": 0.6787705503931379, + "grad_norm": 0.5778757333755493, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0324, + "step": 23740 + }, + { + "epoch": 0.6790564689063617, + "grad_norm": 0.6164223551750183, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0316, + "step": 23750 + }, + { + "epoch": 0.6793423874195854, + "grad_norm": 0.2872319221496582, + "learning_rate": 3.979785400791052e-06, + "loss": 0.034, + "step": 23760 + }, + { + "epoch": 0.6796283059328091, + "grad_norm": 0.6088704466819763, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0317, + "step": 23770 + }, + { + "epoch": 0.6799142244460329, + "grad_norm": 0.4733040928840637, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0282, + "step": 23780 + }, + { + "epoch": 0.6802001429592566, + "grad_norm": 1.3417131900787354, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0304, + "step": 23790 + }, + { + "epoch": 0.6804860614724804, + "grad_norm": 0.7316146492958069, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0311, + "step": 23800 + }, + { + "epoch": 0.680771979985704, + "grad_norm": 0.5726248025894165, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0323, + "step": 23810 + }, + { + "epoch": 0.6810578984989278, + "grad_norm": 0.3990941345691681, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0277, + "step": 23820 + }, + { + "epoch": 0.6813438170121515, + "grad_norm": 0.49237731099128723, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0287, + "step": 23830 + }, + { + "epoch": 0.6816297355253753, + "grad_norm": 0.47560542821884155, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0298, + "step": 23840 + }, + { + "epoch": 0.681915654038599, + "grad_norm": 0.5967867374420166, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0295, + "step": 23850 + }, + { + "epoch": 0.6822015725518227, + "grad_norm": 0.5726722478866577, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0283, + "step": 23860 + }, + { + "epoch": 0.6824874910650465, + "grad_norm": 0.282678484916687, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0303, + "step": 23870 + }, + { + "epoch": 0.6827734095782702, + "grad_norm": 0.4432118237018585, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0296, + "step": 23880 + }, + { + "epoch": 0.683059328091494, + "grad_norm": 0.33677008748054504, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0379, + "step": 23890 + }, + { + "epoch": 0.6833452466047176, + "grad_norm": 0.5063587427139282, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0281, + "step": 23900 + }, + { + "epoch": 0.6836311651179414, + "grad_norm": 0.2592383921146393, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0263, + "step": 23910 + }, + { + "epoch": 0.6839170836311651, + "grad_norm": 0.4482796788215637, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0289, + "step": 23920 + }, + { + "epoch": 0.6842030021443889, + "grad_norm": 0.2609167993068695, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0294, + "step": 23930 + }, + { + "epoch": 0.6844889206576126, + "grad_norm": 0.36982619762420654, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0306, + "step": 23940 + }, + { + "epoch": 0.6847748391708363, + "grad_norm": 0.47758495807647705, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0273, + "step": 23950 + }, + { + "epoch": 0.68506075768406, + "grad_norm": 0.5566948652267456, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0318, + "step": 23960 + }, + { + "epoch": 0.6853466761972837, + "grad_norm": 0.7815461754798889, + "learning_rate": 3.853493736024934e-06, + "loss": 0.03, + "step": 23970 + }, + { + "epoch": 0.6856325947105075, + "grad_norm": 0.42888402938842773, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0384, + "step": 23980 + }, + { + "epoch": 0.6859185132237312, + "grad_norm": 0.47878748178482056, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0356, + "step": 23990 + }, + { + "epoch": 0.686204431736955, + "grad_norm": 0.3847522735595703, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0272, + "step": 24000 + }, + { + "epoch": 0.6864903502501787, + "grad_norm": 0.7005330920219421, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0314, + "step": 24010 + }, + { + "epoch": 0.6867762687634025, + "grad_norm": 0.7769733667373657, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0306, + "step": 24020 + }, + { + "epoch": 0.6870621872766262, + "grad_norm": 0.4073965847492218, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0286, + "step": 24030 + }, + { + "epoch": 0.6873481057898498, + "grad_norm": 0.6220553517341614, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0358, + "step": 24040 + }, + { + "epoch": 0.6876340243030736, + "grad_norm": 0.32508641481399536, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0284, + "step": 24050 + }, + { + "epoch": 0.6879199428162973, + "grad_norm": 0.4828036427497864, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0308, + "step": 24060 + }, + { + "epoch": 0.6882058613295211, + "grad_norm": 0.4809496998786926, + "learning_rate": 3.794650811106129e-06, + "loss": 0.028, + "step": 24070 + }, + { + "epoch": 0.6884917798427448, + "grad_norm": 0.8497998714447021, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.037, + "step": 24080 + }, + { + "epoch": 0.6887776983559686, + "grad_norm": 0.758666455745697, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0323, + "step": 24090 + }, + { + "epoch": 0.6890636168691923, + "grad_norm": 0.40550050139427185, + "learning_rate": 3.777162510056721e-06, + "loss": 0.0359, + "step": 24100 + }, + { + "epoch": 0.6893495353824161, + "grad_norm": 0.4595869779586792, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0304, + "step": 24110 + }, + { + "epoch": 0.6896354538956397, + "grad_norm": 0.5098794102668762, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0279, + "step": 24120 + }, + { + "epoch": 0.6899213724088634, + "grad_norm": 0.3320889174938202, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0287, + "step": 24130 + }, + { + "epoch": 0.6902072909220872, + "grad_norm": 0.4708438515663147, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0289, + "step": 24140 + }, + { + "epoch": 0.6904932094353109, + "grad_norm": 1.0990219116210938, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0303, + "step": 24150 + }, + { + "epoch": 0.6907791279485347, + "grad_norm": 0.5109107494354248, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0306, + "step": 24160 + }, + { + "epoch": 0.6910650464617584, + "grad_norm": 0.6247434616088867, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0332, + "step": 24170 + }, + { + "epoch": 0.6913509649749822, + "grad_norm": 0.4033079743385315, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0252, + "step": 24180 + }, + { + "epoch": 0.6916368834882058, + "grad_norm": 0.36993420124053955, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0287, + "step": 24190 + }, + { + "epoch": 0.6919228020014296, + "grad_norm": 0.37320762872695923, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0364, + "step": 24200 + }, + { + "epoch": 0.6922087205146533, + "grad_norm": 0.6411201357841492, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0306, + "step": 24210 + }, + { + "epoch": 0.692494639027877, + "grad_norm": 0.7033433318138123, + "learning_rate": 3.707974016467e-06, + "loss": 0.0334, + "step": 24220 + }, + { + "epoch": 0.6927805575411008, + "grad_norm": 0.5307570695877075, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0338, + "step": 24230 + }, + { + "epoch": 0.6930664760543245, + "grad_norm": 0.6726395487785339, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0379, + "step": 24240 + }, + { + "epoch": 0.6933523945675483, + "grad_norm": 0.5609936714172363, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0272, + "step": 24250 + }, + { + "epoch": 0.693638313080772, + "grad_norm": 0.5961005687713623, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0361, + "step": 24260 + }, + { + "epoch": 0.6939242315939957, + "grad_norm": 0.46744176745414734, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0291, + "step": 24270 + }, + { + "epoch": 0.6942101501072194, + "grad_norm": 0.5180732607841492, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0377, + "step": 24280 + }, + { + "epoch": 0.6944960686204432, + "grad_norm": 0.594201922416687, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0312, + "step": 24290 + }, + { + "epoch": 0.6947819871336669, + "grad_norm": 0.5852509140968323, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0303, + "step": 24300 + }, + { + "epoch": 0.6950679056468906, + "grad_norm": 0.7885274291038513, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0329, + "step": 24310 + }, + { + "epoch": 0.6953538241601144, + "grad_norm": 0.5280163884162903, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.031, + "step": 24320 + }, + { + "epoch": 0.6956397426733381, + "grad_norm": 0.6047127842903137, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0283, + "step": 24330 + }, + { + "epoch": 0.6959256611865619, + "grad_norm": 0.43192219734191895, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0338, + "step": 24340 + }, + { + "epoch": 0.6962115796997855, + "grad_norm": 0.3320246636867523, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0262, + "step": 24350 + }, + { + "epoch": 0.6964974982130093, + "grad_norm": 0.46365252137184143, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0342, + "step": 24360 + }, + { + "epoch": 0.696783416726233, + "grad_norm": 0.537933886051178, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0286, + "step": 24370 + }, + { + "epoch": 0.6970693352394568, + "grad_norm": 0.3574221134185791, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0342, + "step": 24380 + }, + { + "epoch": 0.6973552537526805, + "grad_norm": 0.7051029205322266, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0316, + "step": 24390 + }, + { + "epoch": 0.6976411722659042, + "grad_norm": 0.587533712387085, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0302, + "step": 24400 + }, + { + "epoch": 0.697927090779128, + "grad_norm": 0.555778980255127, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0301, + "step": 24410 + }, + { + "epoch": 0.6982130092923516, + "grad_norm": 0.44060736894607544, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0296, + "step": 24420 + }, + { + "epoch": 0.6984989278055754, + "grad_norm": 0.3930843472480774, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0327, + "step": 24430 + }, + { + "epoch": 0.6987848463187991, + "grad_norm": 0.8878913521766663, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0389, + "step": 24440 + }, + { + "epoch": 0.6990707648320229, + "grad_norm": 0.45810988545417786, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0322, + "step": 24450 + }, + { + "epoch": 0.6993566833452466, + "grad_norm": 0.41808775067329407, + "learning_rate": 3.573305344104808e-06, + "loss": 0.032, + "step": 24460 + }, + { + "epoch": 0.6996426018584704, + "grad_norm": 0.5060444474220276, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0317, + "step": 24470 + }, + { + "epoch": 0.6999285203716941, + "grad_norm": 0.28741514682769775, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0271, + "step": 24480 + }, + { + "epoch": 0.7002144388849177, + "grad_norm": 0.5564437508583069, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0279, + "step": 24490 + }, + { + "epoch": 0.7005003573981415, + "grad_norm": 0.43762925267219543, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0317, + "step": 24500 + }, + { + "epoch": 0.7007862759113652, + "grad_norm": 0.46590355038642883, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0314, + "step": 24510 + }, + { + "epoch": 0.701072194424589, + "grad_norm": 0.640477180480957, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0258, + "step": 24520 + }, + { + "epoch": 0.7013581129378127, + "grad_norm": 0.5845742225646973, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0283, + "step": 24530 + }, + { + "epoch": 0.7016440314510365, + "grad_norm": 0.5625128746032715, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0381, + "step": 24540 + }, + { + "epoch": 0.7019299499642602, + "grad_norm": 0.4365232586860657, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0297, + "step": 24550 + }, + { + "epoch": 0.702215868477484, + "grad_norm": 0.5942055583000183, + "learning_rate": 3.518669865884119e-06, + "loss": 0.034, + "step": 24560 + }, + { + "epoch": 0.7025017869907076, + "grad_norm": 0.3847256302833557, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0293, + "step": 24570 + }, + { + "epoch": 0.7027877055039313, + "grad_norm": 0.542539119720459, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0327, + "step": 24580 + }, + { + "epoch": 0.7030736240171551, + "grad_norm": 0.5383610129356384, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0322, + "step": 24590 + }, + { + "epoch": 0.7033595425303788, + "grad_norm": 0.6085273027420044, + "learning_rate": 3.497061149826966e-06, + "loss": 0.0293, + "step": 24600 + }, + { + "epoch": 0.7036454610436026, + "grad_norm": 0.5107666254043579, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0266, + "step": 24610 + }, + { + "epoch": 0.7039313795568263, + "grad_norm": 0.4976873993873596, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0301, + "step": 24620 + }, + { + "epoch": 0.7042172980700501, + "grad_norm": 0.5735257863998413, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0264, + "step": 24630 + }, + { + "epoch": 0.7045032165832738, + "grad_norm": 0.6035013794898987, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0286, + "step": 24640 + }, + { + "epoch": 0.7047891350964975, + "grad_norm": 0.5665635466575623, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0345, + "step": 24650 + }, + { + "epoch": 0.7050750536097212, + "grad_norm": 0.5783578753471375, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0385, + "step": 24660 + }, + { + "epoch": 0.7053609721229449, + "grad_norm": 0.3957138657569885, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.0319, + "step": 24670 + }, + { + "epoch": 0.7056468906361687, + "grad_norm": 0.32982495427131653, + "learning_rate": 3.454266765790622e-06, + "loss": 0.034, + "step": 24680 + }, + { + "epoch": 0.7059328091493924, + "grad_norm": 0.5827629566192627, + "learning_rate": 3.448957251110008e-06, + "loss": 0.029, + "step": 24690 + }, + { + "epoch": 0.7062187276626162, + "grad_norm": 0.28891173005104065, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0328, + "step": 24700 + }, + { + "epoch": 0.7065046461758399, + "grad_norm": 0.7992371320724487, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0333, + "step": 24710 + }, + { + "epoch": 0.7067905646890636, + "grad_norm": 0.5976162552833557, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0327, + "step": 24720 + }, + { + "epoch": 0.7070764832022873, + "grad_norm": 0.4785068929195404, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0334, + "step": 24730 + }, + { + "epoch": 0.7073624017155111, + "grad_norm": 0.6561854481697083, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0317, + "step": 24740 + }, + { + "epoch": 0.7076483202287348, + "grad_norm": 0.6745696067810059, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0289, + "step": 24750 + }, + { + "epoch": 0.7079342387419585, + "grad_norm": 0.4914945960044861, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0306, + "step": 24760 + }, + { + "epoch": 0.7082201572551823, + "grad_norm": 0.35789182782173157, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0327, + "step": 24770 + }, + { + "epoch": 0.708506075768406, + "grad_norm": 0.416161447763443, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0313, + "step": 24780 + }, + { + "epoch": 0.7087919942816298, + "grad_norm": 0.6271718740463257, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0314, + "step": 24790 + }, + { + "epoch": 0.7090779127948534, + "grad_norm": 0.5230259895324707, + "learning_rate": 3.391138816571675e-06, + "loss": 0.037, + "step": 24800 + }, + { + "epoch": 0.7093638313080772, + "grad_norm": 0.54779452085495, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0364, + "step": 24810 + }, + { + "epoch": 0.7096497498213009, + "grad_norm": 0.6326698064804077, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0287, + "step": 24820 + }, + { + "epoch": 0.7099356683345247, + "grad_norm": 0.576437771320343, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0321, + "step": 24830 + }, + { + "epoch": 0.7102215868477484, + "grad_norm": 0.49094530940055847, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0328, + "step": 24840 + }, + { + "epoch": 0.7105075053609721, + "grad_norm": 3.1826400756835938, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0497, + "step": 24850 + }, + { + "epoch": 0.7107934238741959, + "grad_norm": 0.6048339009284973, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0429, + "step": 24860 + }, + { + "epoch": 0.7110793423874195, + "grad_norm": 0.6633393168449402, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0287, + "step": 24870 + }, + { + "epoch": 0.7113652609006433, + "grad_norm": 0.24930168688297272, + "learning_rate": 3.349767211300933e-06, + "loss": 0.027, + "step": 24880 + }, + { + "epoch": 0.711651179413867, + "grad_norm": 0.3934503495693207, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0256, + "step": 24890 + }, + { + "epoch": 0.7119370979270908, + "grad_norm": 0.7811068892478943, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.03, + "step": 24900 + }, + { + "epoch": 0.7122230164403145, + "grad_norm": 0.4274163246154785, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0263, + "step": 24910 + }, + { + "epoch": 0.7125089349535383, + "grad_norm": 0.5188158750534058, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0264, + "step": 24920 + }, + { + "epoch": 0.712794853466762, + "grad_norm": 0.4106016457080841, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0309, + "step": 24930 + }, + { + "epoch": 0.7130807719799857, + "grad_norm": 0.5283434987068176, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0335, + "step": 24940 + }, + { + "epoch": 0.7133666904932094, + "grad_norm": 0.38160789012908936, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0313, + "step": 24950 + }, + { + "epoch": 0.7136526090064331, + "grad_norm": 0.30552029609680176, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0265, + "step": 24960 + }, + { + "epoch": 0.7139385275196569, + "grad_norm": 0.40023618936538696, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0295, + "step": 24970 + }, + { + "epoch": 0.7142244460328806, + "grad_norm": 0.3569220006465912, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0259, + "step": 24980 + }, + { + "epoch": 0.7145103645461044, + "grad_norm": 0.39430442452430725, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0313, + "step": 24990 + }, + { + "epoch": 0.7147962830593281, + "grad_norm": 0.5891808271408081, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0272, + "step": 25000 + }, + { + "epoch": 0.7150822015725519, + "grad_norm": 0.487945556640625, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0308, + "step": 25010 + }, + { + "epoch": 0.7153681200857755, + "grad_norm": 0.551268458366394, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.026, + "step": 25020 + }, + { + "epoch": 0.7156540385989992, + "grad_norm": 0.7384896278381348, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0371, + "step": 25030 + }, + { + "epoch": 0.715939957112223, + "grad_norm": 0.43013718724250793, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0348, + "step": 25040 + }, + { + "epoch": 0.7162258756254467, + "grad_norm": 0.28747591376304626, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0249, + "step": 25050 + }, + { + "epoch": 0.7165117941386705, + "grad_norm": 0.48107975721359253, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0247, + "step": 25060 + }, + { + "epoch": 0.7167977126518942, + "grad_norm": 0.4077073931694031, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0313, + "step": 25070 + }, + { + "epoch": 0.717083631165118, + "grad_norm": 0.7853788137435913, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0286, + "step": 25080 + }, + { + "epoch": 0.7173695496783417, + "grad_norm": 0.6021899580955505, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0316, + "step": 25090 + }, + { + "epoch": 0.7176554681915654, + "grad_norm": 0.5997788906097412, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0286, + "step": 25100 + }, + { + "epoch": 0.7179413867047891, + "grad_norm": 0.47682714462280273, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0329, + "step": 25110 + }, + { + "epoch": 0.7182273052180128, + "grad_norm": 0.6501848697662354, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0289, + "step": 25120 + }, + { + "epoch": 0.7185132237312366, + "grad_norm": 1.000689148902893, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0351, + "step": 25130 + }, + { + "epoch": 0.7187991422444603, + "grad_norm": 0.5946705937385559, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0268, + "step": 25140 + }, + { + "epoch": 0.7190850607576841, + "grad_norm": 0.46967631578445435, + "learning_rate": 3.214397932123149e-06, + "loss": 0.031, + "step": 25150 + }, + { + "epoch": 0.7193709792709078, + "grad_norm": 1.052093744277954, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0302, + "step": 25160 + }, + { + "epoch": 0.7196568977841316, + "grad_norm": 0.9337649941444397, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0304, + "step": 25170 + }, + { + "epoch": 0.7199428162973552, + "grad_norm": 0.423648864030838, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0297, + "step": 25180 + }, + { + "epoch": 0.720228734810579, + "grad_norm": 0.46862924098968506, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.028, + "step": 25190 + }, + { + "epoch": 0.7205146533238027, + "grad_norm": 0.7099304795265198, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0306, + "step": 25200 + }, + { + "epoch": 0.7208005718370264, + "grad_norm": 0.5219885110855103, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0269, + "step": 25210 + }, + { + "epoch": 0.7210864903502502, + "grad_norm": 0.6347305774688721, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0307, + "step": 25220 + }, + { + "epoch": 0.7213724088634739, + "grad_norm": 0.7043943405151367, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0267, + "step": 25230 + }, + { + "epoch": 0.7216583273766977, + "grad_norm": 0.4137915074825287, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.028, + "step": 25240 + }, + { + "epoch": 0.7219442458899213, + "grad_norm": 0.4374844431877136, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0246, + "step": 25250 + }, + { + "epoch": 0.7222301644031451, + "grad_norm": 0.6796316504478455, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0285, + "step": 25260 + }, + { + "epoch": 0.7225160829163688, + "grad_norm": 0.4662792980670929, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0287, + "step": 25270 + }, + { + "epoch": 0.7228020014295926, + "grad_norm": 0.4035339653491974, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0289, + "step": 25280 + }, + { + "epoch": 0.7230879199428163, + "grad_norm": 0.40217533707618713, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0238, + "step": 25290 + }, + { + "epoch": 0.72337383845604, + "grad_norm": 0.3640667796134949, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0303, + "step": 25300 + }, + { + "epoch": 0.7236597569692638, + "grad_norm": 0.38176655769348145, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.0283, + "step": 25310 + }, + { + "epoch": 0.7239456754824874, + "grad_norm": 0.40747207403182983, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.031, + "step": 25320 + }, + { + "epoch": 0.7242315939957112, + "grad_norm": 0.3859431743621826, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0306, + "step": 25330 + }, + { + "epoch": 0.7245175125089349, + "grad_norm": 0.23738636076450348, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0313, + "step": 25340 + }, + { + "epoch": 0.7248034310221587, + "grad_norm": 0.3772980272769928, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0308, + "step": 25350 + }, + { + "epoch": 0.7250893495353824, + "grad_norm": 0.5451138019561768, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.03, + "step": 25360 + }, + { + "epoch": 0.7253752680486062, + "grad_norm": 0.6431843638420105, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0371, + "step": 25370 + }, + { + "epoch": 0.7256611865618299, + "grad_norm": 0.42552369832992554, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0311, + "step": 25380 + }, + { + "epoch": 0.7259471050750536, + "grad_norm": 0.5802433490753174, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0316, + "step": 25390 + }, + { + "epoch": 0.7262330235882773, + "grad_norm": 0.31489041447639465, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0282, + "step": 25400 + }, + { + "epoch": 0.726518942101501, + "grad_norm": 0.4227478504180908, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0274, + "step": 25410 + }, + { + "epoch": 0.7268048606147248, + "grad_norm": 0.5510851740837097, + "learning_rate": 3.085688933413021e-06, + "loss": 0.0297, + "step": 25420 + }, + { + "epoch": 0.7270907791279485, + "grad_norm": 0.3073323667049408, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0244, + "step": 25430 + }, + { + "epoch": 0.7273766976411723, + "grad_norm": 0.7394781112670898, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.028, + "step": 25440 + }, + { + "epoch": 0.727662616154396, + "grad_norm": 0.5067957639694214, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0374, + "step": 25450 + }, + { + "epoch": 0.7279485346676198, + "grad_norm": 0.4093882739543915, + "learning_rate": 3.067194157156521e-06, + "loss": 0.0347, + "step": 25460 + }, + { + "epoch": 0.7282344531808435, + "grad_norm": 0.37054866552352905, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0305, + "step": 25470 + }, + { + "epoch": 0.7285203716940671, + "grad_norm": 0.38795027136802673, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0282, + "step": 25480 + }, + { + "epoch": 0.7288062902072909, + "grad_norm": 0.49282407760620117, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0301, + "step": 25490 + }, + { + "epoch": 0.7290922087205146, + "grad_norm": 0.5234564542770386, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0352, + "step": 25500 + }, + { + "epoch": 0.7293781272337384, + "grad_norm": 0.5383297801017761, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0317, + "step": 25510 + }, + { + "epoch": 0.7296640457469621, + "grad_norm": 0.4277333617210388, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0278, + "step": 25520 + }, + { + "epoch": 0.7299499642601859, + "grad_norm": 0.6099430322647095, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0356, + "step": 25530 + }, + { + "epoch": 0.7302358827734096, + "grad_norm": 0.38870710134506226, + "learning_rate": 3.030651808761638e-06, + "loss": 0.027, + "step": 25540 + }, + { + "epoch": 0.7305218012866334, + "grad_norm": 0.48884090781211853, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0251, + "step": 25550 + }, + { + "epoch": 0.730807719799857, + "grad_norm": 0.5136672258377075, + "learning_rate": 3.021609639602321e-06, + "loss": 0.025, + "step": 25560 + }, + { + "epoch": 0.7310936383130807, + "grad_norm": 0.527056872844696, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.03, + "step": 25570 + }, + { + "epoch": 0.7313795568263045, + "grad_norm": 0.7081360220909119, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0303, + "step": 25580 + }, + { + "epoch": 0.7316654753395282, + "grad_norm": 0.48397257924079895, + "learning_rate": 3.008116622200155e-06, + "loss": 0.032, + "step": 25590 + }, + { + "epoch": 0.731951393852752, + "grad_norm": 0.38431495428085327, + "learning_rate": 3.003637700546652e-06, + "loss": 0.0337, + "step": 25600 + }, + { + "epoch": 0.7322373123659757, + "grad_norm": 0.48320460319519043, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0336, + "step": 25610 + }, + { + "epoch": 0.7325232308791995, + "grad_norm": 0.3164500892162323, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0244, + "step": 25620 + }, + { + "epoch": 0.7328091493924231, + "grad_norm": 0.5140587091445923, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0267, + "step": 25630 + }, + { + "epoch": 0.7330950679056469, + "grad_norm": 0.30739104747772217, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0373, + "step": 25640 + }, + { + "epoch": 0.7333809864188706, + "grad_norm": 0.3579956591129303, + "learning_rate": 2.981383959667165e-06, + "loss": 0.0328, + "step": 25650 + }, + { + "epoch": 0.7336669049320943, + "grad_norm": 0.7733256220817566, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0335, + "step": 25660 + }, + { + "epoch": 0.7339528234453181, + "grad_norm": 0.5355008244514465, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0291, + "step": 25670 + }, + { + "epoch": 0.7342387419585418, + "grad_norm": 0.5733621120452881, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0223, + "step": 25680 + }, + { + "epoch": 0.7345246604717656, + "grad_norm": 0.4484233260154724, + "learning_rate": 2.963750320724704e-06, + "loss": 0.03, + "step": 25690 + }, + { + "epoch": 0.7348105789849892, + "grad_norm": 0.46975597739219666, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0325, + "step": 25700 + }, + { + "epoch": 0.735096497498213, + "grad_norm": 0.4674699008464813, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0321, + "step": 25710 + }, + { + "epoch": 0.7353824160114367, + "grad_norm": 0.301565557718277, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0279, + "step": 25720 + }, + { + "epoch": 0.7356683345246605, + "grad_norm": 0.41966041922569275, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0319, + "step": 25730 + }, + { + "epoch": 0.7359542530378842, + "grad_norm": 0.5388277173042297, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0287, + "step": 25740 + }, + { + "epoch": 0.7362401715511079, + "grad_norm": 0.5821589231491089, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0298, + "step": 25750 + }, + { + "epoch": 0.7365260900643317, + "grad_norm": 0.9340733289718628, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0307, + "step": 25760 + }, + { + "epoch": 0.7368120085775554, + "grad_norm": 0.3654371201992035, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0284, + "step": 25770 + }, + { + "epoch": 0.7370979270907791, + "grad_norm": 0.38794293999671936, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0306, + "step": 25780 + }, + { + "epoch": 0.7373838456040028, + "grad_norm": 0.39955422282218933, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.0324, + "step": 25790 + }, + { + "epoch": 0.7376697641172266, + "grad_norm": 0.5864313244819641, + "learning_rate": 2.916036854664115e-06, + "loss": 0.031, + "step": 25800 + }, + { + "epoch": 0.7379556826304503, + "grad_norm": 0.4324203431606293, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0253, + "step": 25810 + }, + { + "epoch": 0.7382416011436741, + "grad_norm": 0.6346203684806824, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0278, + "step": 25820 + }, + { + "epoch": 0.7385275196568978, + "grad_norm": 0.3984649181365967, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0352, + "step": 25830 + }, + { + "epoch": 0.7388134381701215, + "grad_norm": 0.3954542577266693, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0305, + "step": 25840 + }, + { + "epoch": 0.7390993566833453, + "grad_norm": 0.3119542598724365, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0372, + "step": 25850 + }, + { + "epoch": 0.7393852751965689, + "grad_norm": 0.4094623029232025, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0251, + "step": 25860 + }, + { + "epoch": 0.7396711937097927, + "grad_norm": 0.5250104665756226, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0302, + "step": 25870 + }, + { + "epoch": 0.7399571122230164, + "grad_norm": 0.7610230445861816, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0257, + "step": 25880 + }, + { + "epoch": 0.7402430307362402, + "grad_norm": 0.5546014904975891, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0249, + "step": 25890 + }, + { + "epoch": 0.7405289492494639, + "grad_norm": 0.22835634648799896, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0287, + "step": 25900 + }, + { + "epoch": 0.7408148677626877, + "grad_norm": 0.7073826789855957, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0409, + "step": 25910 + }, + { + "epoch": 0.7411007862759114, + "grad_norm": 0.604634165763855, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0293, + "step": 25920 + }, + { + "epoch": 0.741386704789135, + "grad_norm": 0.46605581045150757, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0261, + "step": 25930 + }, + { + "epoch": 0.7416726233023588, + "grad_norm": 0.35719090700149536, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0304, + "step": 25940 + }, + { + "epoch": 0.7419585418155825, + "grad_norm": 0.3806651532649994, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0313, + "step": 25950 + }, + { + "epoch": 0.7422444603288063, + "grad_norm": 0.6443240642547607, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0303, + "step": 25960 + }, + { + "epoch": 0.74253037884203, + "grad_norm": 0.42187514901161194, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0282, + "step": 25970 + }, + { + "epoch": 0.7428162973552538, + "grad_norm": 0.4213440418243408, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0312, + "step": 25980 + }, + { + "epoch": 0.7431022158684775, + "grad_norm": 0.3982003331184387, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0279, + "step": 25990 + }, + { + "epoch": 0.7433881343817013, + "grad_norm": 0.3418596386909485, + "learning_rate": 2.832230653119002e-06, + "loss": 0.0318, + "step": 26000 + }, + { + "epoch": 0.7436740528949249, + "grad_norm": 0.3633996844291687, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0301, + "step": 26010 + }, + { + "epoch": 0.7439599714081486, + "grad_norm": 0.362079918384552, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.028, + "step": 26020 + }, + { + "epoch": 0.7442458899213724, + "grad_norm": 0.4734862744808197, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.031, + "step": 26030 + }, + { + "epoch": 0.7445318084345961, + "grad_norm": 0.31540775299072266, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0287, + "step": 26040 + }, + { + "epoch": 0.7448177269478199, + "grad_norm": 0.6774418950080872, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.029, + "step": 26050 + }, + { + "epoch": 0.7451036454610436, + "grad_norm": 0.3063428997993469, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0308, + "step": 26060 + }, + { + "epoch": 0.7453895639742674, + "grad_norm": 0.691943347454071, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.0265, + "step": 26070 + }, + { + "epoch": 0.745675482487491, + "grad_norm": 0.5507379174232483, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0286, + "step": 26080 + }, + { + "epoch": 0.7459614010007148, + "grad_norm": 0.34355828166007996, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.024, + "step": 26090 + }, + { + "epoch": 0.7462473195139385, + "grad_norm": 0.5120819807052612, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0288, + "step": 26100 + }, + { + "epoch": 0.7465332380271622, + "grad_norm": 0.5197821259498596, + "learning_rate": 2.78776903555923e-06, + "loss": 0.028, + "step": 26110 + }, + { + "epoch": 0.746819156540386, + "grad_norm": 0.46328091621398926, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.0247, + "step": 26120 + }, + { + "epoch": 0.7471050750536097, + "grad_norm": 0.6205909848213196, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0306, + "step": 26130 + }, + { + "epoch": 0.7473909935668335, + "grad_norm": 0.4201740622520447, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.025, + "step": 26140 + }, + { + "epoch": 0.7476769120800572, + "grad_norm": 0.23724111914634705, + "learning_rate": 2.771889969647e-06, + "loss": 0.0283, + "step": 26150 + }, + { + "epoch": 0.747962830593281, + "grad_norm": 0.8046770691871643, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0318, + "step": 26160 + }, + { + "epoch": 0.7482487491065046, + "grad_norm": 0.5273832082748413, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0342, + "step": 26170 + }, + { + "epoch": 0.7485346676197284, + "grad_norm": 0.923651397228241, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0255, + "step": 26180 + }, + { + "epoch": 0.7488205861329521, + "grad_norm": 0.6395840644836426, + "learning_rate": 2.756165401834933e-06, + "loss": 0.0277, + "step": 26190 + }, + { + "epoch": 0.7491065046461758, + "grad_norm": 0.44334620237350464, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.0285, + "step": 26200 + }, + { + "epoch": 0.7493924231593996, + "grad_norm": 0.47904232144355774, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0282, + "step": 26210 + }, + { + "epoch": 0.7496783416726233, + "grad_norm": 0.9316203594207764, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0307, + "step": 26220 + }, + { + "epoch": 0.749964260185847, + "grad_norm": 0.5045170783996582, + "learning_rate": 2.740595627381096e-06, + "loss": 0.0242, + "step": 26230 + }, + { + "epoch": 0.7502501786990707, + "grad_norm": 0.54493248462677, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0263, + "step": 26240 + }, + { + "epoch": 0.7505360972122945, + "grad_norm": 0.6128116846084595, + "learning_rate": 2.732868879137055e-06, + "loss": 0.0305, + "step": 26250 + }, + { + "epoch": 0.7508220157255182, + "grad_norm": 0.6235067844390869, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.037, + "step": 26260 + }, + { + "epoch": 0.751107934238742, + "grad_norm": 0.43458008766174316, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0274, + "step": 26270 + }, + { + "epoch": 0.7513938527519657, + "grad_norm": 0.5540400147438049, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0289, + "step": 26280 + }, + { + "epoch": 0.7516797712651894, + "grad_norm": 0.4317619204521179, + "learning_rate": 2.717531841969889e-06, + "loss": 0.0313, + "step": 26290 + }, + { + "epoch": 0.7519656897784132, + "grad_norm": 0.42271071672439575, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0291, + "step": 26300 + }, + { + "epoch": 0.7522516082916368, + "grad_norm": 0.6096150875091553, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0374, + "step": 26310 + }, + { + "epoch": 0.7525375268048606, + "grad_norm": 0.5820568799972534, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.027, + "step": 26320 + }, + { + "epoch": 0.7528234453180843, + "grad_norm": 0.4441884756088257, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0258, + "step": 26330 + }, + { + "epoch": 0.7531093638313081, + "grad_norm": 0.48442211747169495, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.0257, + "step": 26340 + }, + { + "epoch": 0.7533952823445318, + "grad_norm": 0.7179747223854065, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0281, + "step": 26350 + }, + { + "epoch": 0.7536812008577556, + "grad_norm": 0.5399336218833923, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.03, + "step": 26360 + }, + { + "epoch": 0.7539671193709793, + "grad_norm": 0.5521562099456787, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.0267, + "step": 26370 + }, + { + "epoch": 0.754253037884203, + "grad_norm": 0.3727903366088867, + "learning_rate": 2.683592557860616e-06, + "loss": 0.0274, + "step": 26380 + }, + { + "epoch": 0.7545389563974267, + "grad_norm": 0.5607078671455383, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.033, + "step": 26390 + }, + { + "epoch": 0.7548248749106504, + "grad_norm": 0.3736121654510498, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.0267, + "step": 26400 + }, + { + "epoch": 0.7551107934238742, + "grad_norm": 0.47778844833374023, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0246, + "step": 26410 + }, + { + "epoch": 0.7553967119370979, + "grad_norm": 0.5479125380516052, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.0273, + "step": 26420 + }, + { + "epoch": 0.7556826304503217, + "grad_norm": 0.5152542591094971, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0288, + "step": 26430 + }, + { + "epoch": 0.7559685489635454, + "grad_norm": 0.38652661442756653, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0319, + "step": 26440 + }, + { + "epoch": 0.7562544674767692, + "grad_norm": 0.8551011085510254, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.0312, + "step": 26450 + }, + { + "epoch": 0.7565403859899928, + "grad_norm": 0.5332438349723816, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0348, + "step": 26460 + }, + { + "epoch": 0.7568263045032165, + "grad_norm": 0.5529776215553284, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.0305, + "step": 26470 + }, + { + "epoch": 0.7571122230164403, + "grad_norm": 0.47610723972320557, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.027, + "step": 26480 + }, + { + "epoch": 0.757398141529664, + "grad_norm": 0.5565681457519531, + "learning_rate": 2.643185095075473e-06, + "loss": 0.0277, + "step": 26490 + }, + { + "epoch": 0.7576840600428878, + "grad_norm": 0.40319734811782837, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0287, + "step": 26500 + }, + { + "epoch": 0.7579699785561115, + "grad_norm": 0.5117385387420654, + "learning_rate": 2.635965610168249e-06, + "loss": 0.0312, + "step": 26510 + }, + { + "epoch": 0.7582558970693353, + "grad_norm": 0.47812822461128235, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0287, + "step": 26520 + }, + { + "epoch": 0.758541815582559, + "grad_norm": 0.24216991662979126, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0321, + "step": 26530 + }, + { + "epoch": 0.7588277340957827, + "grad_norm": 0.24864375591278076, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0312, + "step": 26540 + }, + { + "epoch": 0.7591136526090064, + "grad_norm": 0.39162659645080566, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0333, + "step": 26550 + }, + { + "epoch": 0.7593995711222301, + "grad_norm": 0.30692365765571594, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.0261, + "step": 26560 + }, + { + "epoch": 0.7596854896354539, + "grad_norm": 0.5904929041862488, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0309, + "step": 26570 + }, + { + "epoch": 0.7599714081486776, + "grad_norm": 0.5509836673736572, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0223, + "step": 26580 + }, + { + "epoch": 0.7602573266619014, + "grad_norm": 0.45913293957710266, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0338, + "step": 26590 + }, + { + "epoch": 0.7605432451751251, + "grad_norm": 0.3952873647212982, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.0283, + "step": 26600 + }, + { + "epoch": 0.7608291636883489, + "grad_norm": 0.49259039759635925, + "learning_rate": 2.600457796416397e-06, + "loss": 0.0262, + "step": 26610 + }, + { + "epoch": 0.7611150822015725, + "grad_norm": 0.49096909165382385, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.0265, + "step": 26620 + }, + { + "epoch": 0.7614010007147963, + "grad_norm": 0.48913729190826416, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0347, + "step": 26630 + }, + { + "epoch": 0.76168691922802, + "grad_norm": 0.391233891248703, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0281, + "step": 26640 + }, + { + "epoch": 0.7619728377412437, + "grad_norm": 0.3726404011249542, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0252, + "step": 26650 + }, + { + "epoch": 0.7622587562544675, + "grad_norm": 0.441919207572937, + "learning_rate": 2.583073279935805e-06, + "loss": 0.025, + "step": 26660 + }, + { + "epoch": 0.7625446747676912, + "grad_norm": 0.6720325350761414, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.0264, + "step": 26670 + }, + { + "epoch": 0.762830593280915, + "grad_norm": 0.4706156849861145, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0307, + "step": 26680 + }, + { + "epoch": 0.7631165117941386, + "grad_norm": 0.6154748797416687, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0283, + "step": 26690 + }, + { + "epoch": 0.7634024303073624, + "grad_norm": 0.4765104651451111, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.0292, + "step": 26700 + }, + { + "epoch": 0.7636883488205861, + "grad_norm": 0.33775731921195984, + "learning_rate": 2.565935706183804e-06, + "loss": 0.0281, + "step": 26710 + }, + { + "epoch": 0.7639742673338099, + "grad_norm": 0.9325317144393921, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0282, + "step": 26720 + }, + { + "epoch": 0.7642601858470336, + "grad_norm": 0.5118368864059448, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0264, + "step": 26730 + }, + { + "epoch": 0.7645461043602573, + "grad_norm": 0.6633817553520203, + "learning_rate": 2.555771903907403e-06, + "loss": 0.035, + "step": 26740 + }, + { + "epoch": 0.7648320228734811, + "grad_norm": 0.8666901588439941, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0299, + "step": 26750 + }, + { + "epoch": 0.7651179413867047, + "grad_norm": 0.47465914487838745, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0281, + "step": 26760 + }, + { + "epoch": 0.7654038598999285, + "grad_norm": 0.5317928791046143, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0237, + "step": 26770 + }, + { + "epoch": 0.7656897784131522, + "grad_norm": 0.6626484394073486, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.0297, + "step": 26780 + }, + { + "epoch": 0.765975696926376, + "grad_norm": 0.5603852272033691, + "learning_rate": 2.5390304813179e-06, + "loss": 0.0279, + "step": 26790 + }, + { + "epoch": 0.7662616154395997, + "grad_norm": 0.392030268907547, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.0276, + "step": 26800 + }, + { + "epoch": 0.7665475339528235, + "grad_norm": 0.5270085334777832, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0283, + "step": 26810 + }, + { + "epoch": 0.7668334524660472, + "grad_norm": 0.5256703495979309, + "learning_rate": 2.529104749380281e-06, + "loss": 0.029, + "step": 26820 + }, + { + "epoch": 0.7671193709792709, + "grad_norm": 0.3960905075073242, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0297, + "step": 26830 + }, + { + "epoch": 0.7674052894924946, + "grad_norm": 0.4214257597923279, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0279, + "step": 26840 + }, + { + "epoch": 0.7676912080057183, + "grad_norm": 0.4516659677028656, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0268, + "step": 26850 + }, + { + "epoch": 0.7679771265189421, + "grad_norm": 0.4527135193347931, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0285, + "step": 26860 + }, + { + "epoch": 0.7682630450321658, + "grad_norm": 0.4458029270172119, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0282, + "step": 26870 + }, + { + "epoch": 0.7685489635453896, + "grad_norm": 0.5262351036071777, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.0289, + "step": 26880 + }, + { + "epoch": 0.7688348820586133, + "grad_norm": 0.7576776146888733, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0304, + "step": 26890 + }, + { + "epoch": 0.7691208005718371, + "grad_norm": 0.3779038190841675, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0308, + "step": 26900 + }, + { + "epoch": 0.7694067190850608, + "grad_norm": 0.5801526308059692, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0279, + "step": 26910 + }, + { + "epoch": 0.7696926375982844, + "grad_norm": 0.6423588991165161, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0291, + "step": 26920 + }, + { + "epoch": 0.7699785561115082, + "grad_norm": 0.3891446590423584, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0276, + "step": 26930 + }, + { + "epoch": 0.7702644746247319, + "grad_norm": 0.6453003883361816, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0297, + "step": 26940 + }, + { + "epoch": 0.7705503931379557, + "grad_norm": 0.5512704253196716, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0273, + "step": 26950 + }, + { + "epoch": 0.7708363116511794, + "grad_norm": 0.5719016790390015, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0284, + "step": 26960 + }, + { + "epoch": 0.7711222301644032, + "grad_norm": 0.325624942779541, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0249, + "step": 26970 + }, + { + "epoch": 0.7714081486776269, + "grad_norm": 0.5242589712142944, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0268, + "step": 26980 + }, + { + "epoch": 0.7716940671908507, + "grad_norm": 0.3835712969303131, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.0293, + "step": 26990 + }, + { + "epoch": 0.7719799857040743, + "grad_norm": 0.5894249081611633, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0303, + "step": 27000 + }, + { + "epoch": 0.772265904217298, + "grad_norm": 0.4519590437412262, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.0252, + "step": 27010 + }, + { + "epoch": 0.7725518227305218, + "grad_norm": 0.590528130531311, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0297, + "step": 27020 + }, + { + "epoch": 0.7728377412437455, + "grad_norm": 0.5418447852134705, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0306, + "step": 27030 + }, + { + "epoch": 0.7731236597569693, + "grad_norm": 1.027212142944336, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.0302, + "step": 27040 + }, + { + "epoch": 0.773409578270193, + "grad_norm": 0.5057966709136963, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.0295, + "step": 27050 + }, + { + "epoch": 0.7736954967834168, + "grad_norm": 0.9749689698219299, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.0288, + "step": 27060 + }, + { + "epoch": 0.7739814152966404, + "grad_norm": 0.7263986468315125, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.03, + "step": 27070 + }, + { + "epoch": 0.7742673338098642, + "grad_norm": 0.6080947518348694, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0312, + "step": 27080 + }, + { + "epoch": 0.7745532523230879, + "grad_norm": 0.5187621712684631, + "learning_rate": 2.443811559007335e-06, + "loss": 0.0235, + "step": 27090 + }, + { + "epoch": 0.7748391708363116, + "grad_norm": 0.6019864678382874, + "learning_rate": 2.440792688039862e-06, + "loss": 0.0356, + "step": 27100 + }, + { + "epoch": 0.7751250893495354, + "grad_norm": 0.4716169238090515, + "learning_rate": 2.437783861778914e-06, + "loss": 0.0241, + "step": 27110 + }, + { + "epoch": 0.7754110078627591, + "grad_norm": 0.2648717761039734, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.027, + "step": 27120 + }, + { + "epoch": 0.7756969263759829, + "grad_norm": 0.43119028210639954, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0278, + "step": 27130 + }, + { + "epoch": 0.7759828448892065, + "grad_norm": 0.37466534972190857, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0254, + "step": 27140 + }, + { + "epoch": 0.7762687634024303, + "grad_norm": 0.36353442072868347, + "learning_rate": 2.425849074243997e-06, + "loss": 0.0263, + "step": 27150 + }, + { + "epoch": 0.776554681915654, + "grad_norm": 0.35461705923080444, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0281, + "step": 27160 + }, + { + "epoch": 0.7768406004288778, + "grad_norm": 0.5017783045768738, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0312, + "step": 27170 + }, + { + "epoch": 0.7771265189421015, + "grad_norm": 0.461370050907135, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0301, + "step": 27180 + }, + { + "epoch": 0.7774124374553252, + "grad_norm": 0.3844483494758606, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0276, + "step": 27190 + }, + { + "epoch": 0.777698355968549, + "grad_norm": 0.32640641927719116, + "learning_rate": 2.411157015949963e-06, + "loss": 0.0262, + "step": 27200 + }, + { + "epoch": 0.7779842744817727, + "grad_norm": 0.6539550423622131, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0303, + "step": 27210 + }, + { + "epoch": 0.7782701929949964, + "grad_norm": 0.5505805015563965, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0267, + "step": 27220 + }, + { + "epoch": 0.7785561115082201, + "grad_norm": 0.433768630027771, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0299, + "step": 27230 + }, + { + "epoch": 0.7788420300214439, + "grad_norm": 0.7262346148490906, + "learning_rate": 2.399584779188417e-06, + "loss": 0.0278, + "step": 27240 + }, + { + "epoch": 0.7791279485346676, + "grad_norm": 0.6827511787414551, + "learning_rate": 2.396716944205467e-06, + "loss": 0.0319, + "step": 27250 + }, + { + "epoch": 0.7794138670478914, + "grad_norm": 0.3138200342655182, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.0261, + "step": 27260 + }, + { + "epoch": 0.7796997855611151, + "grad_norm": 0.36588770151138306, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0251, + "step": 27270 + }, + { + "epoch": 0.7799857040743388, + "grad_norm": 1.105770468711853, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0272, + "step": 27280 + }, + { + "epoch": 0.7802716225875626, + "grad_norm": 0.4482360780239105, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.0247, + "step": 27290 + }, + { + "epoch": 0.7805575411007862, + "grad_norm": 0.5545430779457092, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0307, + "step": 27300 + }, + { + "epoch": 0.78084345961401, + "grad_norm": 0.45449620485305786, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0264, + "step": 27310 + }, + { + "epoch": 0.7811293781272337, + "grad_norm": 0.37734025716781616, + "learning_rate": 2.376924986395271e-06, + "loss": 0.0275, + "step": 27320 + }, + { + "epoch": 0.7814152966404575, + "grad_norm": 0.47029784321784973, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0325, + "step": 27330 + }, + { + "epoch": 0.7817012151536812, + "grad_norm": 0.3540012240409851, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0258, + "step": 27340 + }, + { + "epoch": 0.781987133666905, + "grad_norm": 0.8363472819328308, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0288, + "step": 27350 + }, + { + "epoch": 0.7822730521801287, + "grad_norm": 0.5943127274513245, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.0289, + "step": 27360 + }, + { + "epoch": 0.7825589706933523, + "grad_norm": 0.48346707224845886, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0304, + "step": 27370 + }, + { + "epoch": 0.7828448892065761, + "grad_norm": 0.5776712894439697, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.0262, + "step": 27380 + }, + { + "epoch": 0.7831308077197998, + "grad_norm": 0.37524285912513733, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0342, + "step": 27390 + }, + { + "epoch": 0.7834167262330236, + "grad_norm": 0.4272121787071228, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.0262, + "step": 27400 + }, + { + "epoch": 0.7837026447462473, + "grad_norm": 0.3545357286930084, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.0273, + "step": 27410 + }, + { + "epoch": 0.7839885632594711, + "grad_norm": 0.4780922830104828, + "learning_rate": 2.349511203900333e-06, + "loss": 0.0255, + "step": 27420 + }, + { + "epoch": 0.7842744817726948, + "grad_norm": 0.6846514940261841, + "learning_rate": 2.3468256081258e-06, + "loss": 0.035, + "step": 27430 + }, + { + "epoch": 0.7845604002859186, + "grad_norm": 0.6890650391578674, + "learning_rate": 2.344150167333397e-06, + "loss": 0.0305, + "step": 27440 + }, + { + "epoch": 0.7848463187991422, + "grad_norm": 0.41689804196357727, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0275, + "step": 27450 + }, + { + "epoch": 0.7851322373123659, + "grad_norm": 0.5169947743415833, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0261, + "step": 27460 + }, + { + "epoch": 0.7854181558255897, + "grad_norm": 0.3667839467525482, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0267, + "step": 27470 + }, + { + "epoch": 0.7857040743388134, + "grad_norm": 0.4650583267211914, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0259, + "step": 27480 + }, + { + "epoch": 0.7859899928520372, + "grad_norm": 0.5303590297698975, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0318, + "step": 27490 + }, + { + "epoch": 0.7862759113652609, + "grad_norm": 0.38010939955711365, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.0292, + "step": 27500 + }, + { + "epoch": 0.7865618298784847, + "grad_norm": 0.5952475070953369, + "learning_rate": 2.325706683525094e-06, + "loss": 0.0265, + "step": 27510 + }, + { + "epoch": 0.7868477483917083, + "grad_norm": 0.34000876545906067, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0255, + "step": 27520 + }, + { + "epoch": 0.7871336669049321, + "grad_norm": 0.333310604095459, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0266, + "step": 27530 + }, + { + "epoch": 0.7874195854181558, + "grad_norm": 1.0167195796966553, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0303, + "step": 27540 + }, + { + "epoch": 0.7877055039313795, + "grad_norm": 0.506395697593689, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0255, + "step": 27550 + }, + { + "epoch": 0.7879914224446033, + "grad_norm": 0.4995521008968353, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.0232, + "step": 27560 + }, + { + "epoch": 0.788277340957827, + "grad_norm": 0.592944324016571, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0271, + "step": 27570 + }, + { + "epoch": 0.7885632594710508, + "grad_norm": 0.5690013766288757, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0349, + "step": 27580 + }, + { + "epoch": 0.7888491779842745, + "grad_norm": 0.5303569436073303, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0301, + "step": 27590 + }, + { + "epoch": 0.7891350964974982, + "grad_norm": 0.4314960539340973, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0266, + "step": 27600 + }, + { + "epoch": 0.7894210150107219, + "grad_norm": 0.4138862192630768, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0237, + "step": 27610 + }, + { + "epoch": 0.7897069335239457, + "grad_norm": 0.5151752829551697, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.0268, + "step": 27620 + }, + { + "epoch": 0.7899928520371694, + "grad_norm": 0.7513082027435303, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.031, + "step": 27630 + }, + { + "epoch": 0.7902787705503931, + "grad_norm": 0.2644256055355072, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0259, + "step": 27640 + }, + { + "epoch": 0.7905646890636169, + "grad_norm": 0.5767413377761841, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0312, + "step": 27650 + }, + { + "epoch": 0.7908506075768406, + "grad_norm": 0.4754960536956787, + "learning_rate": 2.287865908463585e-06, + "loss": 0.035, + "step": 27660 + }, + { + "epoch": 0.7911365260900644, + "grad_norm": 0.4080045521259308, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.0271, + "step": 27670 + }, + { + "epoch": 0.791422444603288, + "grad_norm": 0.3843805193901062, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0312, + "step": 27680 + }, + { + "epoch": 0.7917083631165118, + "grad_norm": 0.3925490975379944, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0242, + "step": 27690 + }, + { + "epoch": 0.7919942816297355, + "grad_norm": 0.3966064155101776, + "learning_rate": 2.278163146933236e-06, + "loss": 0.0257, + "step": 27700 + }, + { + "epoch": 0.7922802001429593, + "grad_norm": 0.6077889204025269, + "learning_rate": 2.275763038367336e-06, + "loss": 0.0238, + "step": 27710 + }, + { + "epoch": 0.792566118656183, + "grad_norm": 0.6053628921508789, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0356, + "step": 27720 + }, + { + "epoch": 0.7928520371694067, + "grad_norm": 0.49703511595726013, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0283, + "step": 27730 + }, + { + "epoch": 0.7931379556826305, + "grad_norm": 0.5619977712631226, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0272, + "step": 27740 + }, + { + "epoch": 0.7934238741958541, + "grad_norm": 0.6108564734458923, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0271, + "step": 27750 + }, + { + "epoch": 0.7937097927090779, + "grad_norm": 0.4029979109764099, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0265, + "step": 27760 + }, + { + "epoch": 0.7939957112223016, + "grad_norm": 0.45793306827545166, + "learning_rate": 2.261577490652103e-06, + "loss": 0.0229, + "step": 27770 + }, + { + "epoch": 0.7942816297355254, + "grad_norm": 0.433551162481308, + "learning_rate": 2.259249109209093e-06, + "loss": 0.0264, + "step": 27780 + }, + { + "epoch": 0.7945675482487491, + "grad_norm": 0.4247429072856903, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0269, + "step": 27790 + }, + { + "epoch": 0.7948534667619729, + "grad_norm": 0.4973151981830597, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.0281, + "step": 27800 + }, + { + "epoch": 0.7951393852751966, + "grad_norm": 0.5111087560653687, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.0267, + "step": 27810 + }, + { + "epoch": 0.7954253037884202, + "grad_norm": 0.5530220866203308, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0291, + "step": 27820 + }, + { + "epoch": 0.795711222301644, + "grad_norm": 0.4368492662906647, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0302, + "step": 27830 + }, + { + "epoch": 0.7959971408148677, + "grad_norm": 0.5381907820701599, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0253, + "step": 27840 + }, + { + "epoch": 0.7962830593280915, + "grad_norm": 0.3638664186000824, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.0258, + "step": 27850 + }, + { + "epoch": 0.7965689778413152, + "grad_norm": 0.38014277815818787, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.0279, + "step": 27860 + }, + { + "epoch": 0.796854896354539, + "grad_norm": 0.46882548928260803, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.0272, + "step": 27870 + }, + { + "epoch": 0.7971408148677627, + "grad_norm": 0.4826337397098541, + "learning_rate": 2.236529916369313e-06, + "loss": 0.027, + "step": 27880 + }, + { + "epoch": 0.7974267333809865, + "grad_norm": 0.7986114621162415, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0263, + "step": 27890 + }, + { + "epoch": 0.7977126518942101, + "grad_norm": 0.5447944402694702, + "learning_rate": 2.232109406453595e-06, + "loss": 0.0321, + "step": 27900 + }, + { + "epoch": 0.7979985704074338, + "grad_norm": 0.21586239337921143, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0241, + "step": 27910 + }, + { + "epoch": 0.7982844889206576, + "grad_norm": 0.8066816926002502, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0285, + "step": 27920 + }, + { + "epoch": 0.7985704074338813, + "grad_norm": 0.5516615509986877, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0302, + "step": 27930 + }, + { + "epoch": 0.7988563259471051, + "grad_norm": 0.6859652996063232, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0244, + "step": 27940 + }, + { + "epoch": 0.7991422444603288, + "grad_norm": 0.5234702229499817, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0272, + "step": 27950 + }, + { + "epoch": 0.7994281629735526, + "grad_norm": 0.32633450627326965, + "learning_rate": 2.219094909262834e-06, + "loss": 0.0249, + "step": 27960 + }, + { + "epoch": 0.7997140814867763, + "grad_norm": 0.5086314678192139, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0278, + "step": 27970 + }, + { + "epoch": 0.8, + "grad_norm": 0.40988171100616455, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.0297, + "step": 27980 + }, + { + "epoch": 0.8002859185132237, + "grad_norm": 0.4648076891899109, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0271, + "step": 27990 + }, + { + "epoch": 0.8005718370264474, + "grad_norm": 0.7577387690544128, + "learning_rate": 2.210624641425579e-06, + "loss": 0.0328, + "step": 28000 + }, + { + "epoch": 0.8008577555396712, + "grad_norm": 0.39426741003990173, + "learning_rate": 2.208532855337684e-06, + "loss": 0.0243, + "step": 28010 + }, + { + "epoch": 0.8011436740528949, + "grad_norm": 0.5410818457603455, + "learning_rate": 2.2064513865261646e-06, + "loss": 0.0289, + "step": 28020 + }, + { + "epoch": 0.8014295925661187, + "grad_norm": 0.3485671281814575, + "learning_rate": 2.204380237433745e-06, + "loss": 0.0283, + "step": 28030 + }, + { + "epoch": 0.8017155110793424, + "grad_norm": 0.6367644667625427, + "learning_rate": 2.202319410491029e-06, + "loss": 0.0272, + "step": 28040 + }, + { + "epoch": 0.8020014295925662, + "grad_norm": 0.4387468099594116, + "learning_rate": 2.2002689081165155e-06, + "loss": 0.0278, + "step": 28050 + }, + { + "epoch": 0.8022873481057898, + "grad_norm": 0.7296497821807861, + "learning_rate": 2.1982287327165827e-06, + "loss": 0.0259, + "step": 28060 + }, + { + "epoch": 0.8025732666190136, + "grad_norm": 0.40375930070877075, + "learning_rate": 2.19619888668549e-06, + "loss": 0.0259, + "step": 28070 + }, + { + "epoch": 0.8028591851322373, + "grad_norm": 0.6340100169181824, + "learning_rate": 2.1941793724053733e-06, + "loss": 0.0313, + "step": 28080 + }, + { + "epoch": 0.803145103645461, + "grad_norm": 0.3453208804130554, + "learning_rate": 2.1921701922462463e-06, + "loss": 0.028, + "step": 28090 + }, + { + "epoch": 0.8034310221586848, + "grad_norm": 0.5798079967498779, + "learning_rate": 2.190171348565994e-06, + "loss": 0.0257, + "step": 28100 + }, + { + "epoch": 0.8037169406719085, + "grad_norm": 0.3712709844112396, + "learning_rate": 2.188182843710369e-06, + "loss": 0.0255, + "step": 28110 + }, + { + "epoch": 0.8040028591851323, + "grad_norm": 0.3774068355560303, + "learning_rate": 2.1862046800129964e-06, + "loss": 0.0292, + "step": 28120 + }, + { + "epoch": 0.8042887776983559, + "grad_norm": 0.6050616502761841, + "learning_rate": 2.1842368597953578e-06, + "loss": 0.0292, + "step": 28130 + }, + { + "epoch": 0.8045746962115797, + "grad_norm": 0.3443267047405243, + "learning_rate": 2.1822793853668e-06, + "loss": 0.0459, + "step": 28140 + }, + { + "epoch": 0.8048606147248034, + "grad_norm": 0.5769096612930298, + "learning_rate": 2.18033225902453e-06, + "loss": 0.0277, + "step": 28150 + }, + { + "epoch": 0.8051465332380272, + "grad_norm": 0.5020616054534912, + "learning_rate": 2.17839548305361e-06, + "loss": 0.0273, + "step": 28160 + }, + { + "epoch": 0.8054324517512509, + "grad_norm": 0.3149321973323822, + "learning_rate": 2.1764690597269507e-06, + "loss": 0.0282, + "step": 28170 + }, + { + "epoch": 0.8057183702644746, + "grad_norm": 0.3835159242153168, + "learning_rate": 2.17455299130532e-06, + "loss": 0.0295, + "step": 28180 + }, + { + "epoch": 0.8060042887776984, + "grad_norm": 0.6308208703994751, + "learning_rate": 2.17264728003733e-06, + "loss": 0.0333, + "step": 28190 + }, + { + "epoch": 0.806290207290922, + "grad_norm": 0.4104989767074585, + "learning_rate": 2.17075192815944e-06, + "loss": 0.0246, + "step": 28200 + }, + { + "epoch": 0.8065761258041458, + "grad_norm": 0.5490663051605225, + "learning_rate": 2.168866937895951e-06, + "loss": 0.0243, + "step": 28210 + }, + { + "epoch": 0.8068620443173695, + "grad_norm": 0.44579270482063293, + "learning_rate": 2.166992311459001e-06, + "loss": 0.0297, + "step": 28220 + }, + { + "epoch": 0.8071479628305933, + "grad_norm": 0.38116511702537537, + "learning_rate": 2.1651280510485727e-06, + "loss": 0.0223, + "step": 28230 + }, + { + "epoch": 0.807433881343817, + "grad_norm": 0.5825269222259521, + "learning_rate": 2.163274158852476e-06, + "loss": 0.0289, + "step": 28240 + }, + { + "epoch": 0.8077197998570408, + "grad_norm": 0.396100789308548, + "learning_rate": 2.1614306370463605e-06, + "loss": 0.0287, + "step": 28250 + }, + { + "epoch": 0.8080057183702645, + "grad_norm": 0.3408491611480713, + "learning_rate": 2.1595974877936977e-06, + "loss": 0.0275, + "step": 28260 + }, + { + "epoch": 0.8082916368834882, + "grad_norm": 0.4204134941101074, + "learning_rate": 2.1577747132457933e-06, + "loss": 0.0289, + "step": 28270 + }, + { + "epoch": 0.8085775553967119, + "grad_norm": 1.1906534433364868, + "learning_rate": 2.155962315541773e-06, + "loss": 0.0302, + "step": 28280 + }, + { + "epoch": 0.8088634739099356, + "grad_norm": 0.4449160397052765, + "learning_rate": 2.154160296808588e-06, + "loss": 0.026, + "step": 28290 + }, + { + "epoch": 0.8091493924231594, + "grad_norm": 0.9066163301467896, + "learning_rate": 2.1523686591610064e-06, + "loss": 0.0289, + "step": 28300 + }, + { + "epoch": 0.8094353109363831, + "grad_norm": 0.30709517002105713, + "learning_rate": 2.1505874047016146e-06, + "loss": 0.0218, + "step": 28310 + }, + { + "epoch": 0.8097212294496069, + "grad_norm": 0.3318001329898834, + "learning_rate": 2.1488165355208147e-06, + "loss": 0.0281, + "step": 28320 + }, + { + "epoch": 0.8100071479628306, + "grad_norm": 0.34999215602874756, + "learning_rate": 2.14705605369682e-06, + "loss": 0.0285, + "step": 28330 + }, + { + "epoch": 0.8102930664760544, + "grad_norm": 0.41680973768234253, + "learning_rate": 2.145305961295655e-06, + "loss": 0.0305, + "step": 28340 + }, + { + "epoch": 0.810578984989278, + "grad_norm": 0.3743407428264618, + "learning_rate": 2.143566260371149e-06, + "loss": 0.0256, + "step": 28350 + }, + { + "epoch": 0.8108649035025017, + "grad_norm": 0.872268795967102, + "learning_rate": 2.141836952964938e-06, + "loss": 0.0259, + "step": 28360 + }, + { + "epoch": 0.8111508220157255, + "grad_norm": 0.36687538027763367, + "learning_rate": 2.1401180411064616e-06, + "loss": 0.0279, + "step": 28370 + }, + { + "epoch": 0.8114367405289492, + "grad_norm": 0.511329174041748, + "learning_rate": 2.138409526812959e-06, + "loss": 0.0275, + "step": 28380 + }, + { + "epoch": 0.811722659042173, + "grad_norm": 0.3234724998474121, + "learning_rate": 2.1367114120894663e-06, + "loss": 0.0257, + "step": 28390 + }, + { + "epoch": 0.8120085775553967, + "grad_norm": 0.5732539296150208, + "learning_rate": 2.1350236989288136e-06, + "loss": 0.0288, + "step": 28400 + }, + { + "epoch": 0.8122944960686205, + "grad_norm": 0.4985447824001312, + "learning_rate": 2.1333463893116294e-06, + "loss": 0.0248, + "step": 28410 + }, + { + "epoch": 0.8125804145818442, + "grad_norm": 0.49544450640678406, + "learning_rate": 2.131679485206329e-06, + "loss": 0.0295, + "step": 28420 + }, + { + "epoch": 0.812866333095068, + "grad_norm": 1.0728929042816162, + "learning_rate": 2.130022988569117e-06, + "loss": 0.0313, + "step": 28430 + }, + { + "epoch": 0.8131522516082916, + "grad_norm": 0.2358589768409729, + "learning_rate": 2.128376901343984e-06, + "loss": 0.0218, + "step": 28440 + }, + { + "epoch": 0.8134381701215153, + "grad_norm": 0.518035352230072, + "learning_rate": 2.1267412254627056e-06, + "loss": 0.0292, + "step": 28450 + }, + { + "epoch": 0.8137240886347391, + "grad_norm": 0.43305718898773193, + "learning_rate": 2.1251159628448386e-06, + "loss": 0.0301, + "step": 28460 + }, + { + "epoch": 0.8140100071479628, + "grad_norm": 0.7385976314544678, + "learning_rate": 2.1235011153977192e-06, + "loss": 0.0263, + "step": 28470 + }, + { + "epoch": 0.8142959256611866, + "grad_norm": 0.435623437166214, + "learning_rate": 2.121896685016461e-06, + "loss": 0.0283, + "step": 28480 + }, + { + "epoch": 0.8145818441744103, + "grad_norm": 0.5866786241531372, + "learning_rate": 2.1203026735839514e-06, + "loss": 0.0257, + "step": 28490 + }, + { + "epoch": 0.8148677626876341, + "grad_norm": 0.8038771152496338, + "learning_rate": 2.118719082970852e-06, + "loss": 0.0315, + "step": 28500 + }, + { + "epoch": 0.8151536812008577, + "grad_norm": 0.33963197469711304, + "learning_rate": 2.1171459150355947e-06, + "loss": 0.0306, + "step": 28510 + }, + { + "epoch": 0.8154395997140815, + "grad_norm": 0.5540177822113037, + "learning_rate": 2.115583171624381e-06, + "loss": 0.0263, + "step": 28520 + }, + { + "epoch": 0.8157255182273052, + "grad_norm": 0.5438565015792847, + "learning_rate": 2.114030854571176e-06, + "loss": 0.0307, + "step": 28530 + }, + { + "epoch": 0.8160114367405289, + "grad_norm": 0.36572158336639404, + "learning_rate": 2.1124889656977097e-06, + "loss": 0.0265, + "step": 28540 + }, + { + "epoch": 0.8162973552537527, + "grad_norm": 0.27488043904304504, + "learning_rate": 2.1109575068134756e-06, + "loss": 0.0243, + "step": 28550 + }, + { + "epoch": 0.8165832737669764, + "grad_norm": 0.5578693151473999, + "learning_rate": 2.1094364797157267e-06, + "loss": 0.0317, + "step": 28560 + }, + { + "epoch": 0.8168691922802002, + "grad_norm": 0.7271984815597534, + "learning_rate": 2.107925886189472e-06, + "loss": 0.0246, + "step": 28570 + }, + { + "epoch": 0.8171551107934238, + "grad_norm": 0.8810591697692871, + "learning_rate": 2.1064257280074763e-06, + "loss": 0.0285, + "step": 28580 + }, + { + "epoch": 0.8174410293066476, + "grad_norm": 0.43811503052711487, + "learning_rate": 2.1049360069302594e-06, + "loss": 0.0264, + "step": 28590 + }, + { + "epoch": 0.8177269478198713, + "grad_norm": 0.4820844531059265, + "learning_rate": 2.1034567247060926e-06, + "loss": 0.0292, + "step": 28600 + }, + { + "epoch": 0.8180128663330951, + "grad_norm": 0.4385477900505066, + "learning_rate": 2.1019878830709968e-06, + "loss": 0.026, + "step": 28610 + }, + { + "epoch": 0.8182987848463188, + "grad_norm": 0.6242631673812866, + "learning_rate": 2.100529483748737e-06, + "loss": 0.0275, + "step": 28620 + }, + { + "epoch": 0.8185847033595425, + "grad_norm": 0.5929499864578247, + "learning_rate": 2.099081528450828e-06, + "loss": 0.0228, + "step": 28630 + }, + { + "epoch": 0.8188706218727663, + "grad_norm": 0.9688727259635925, + "learning_rate": 2.097644018876524e-06, + "loss": 0.0233, + "step": 28640 + }, + { + "epoch": 0.81915654038599, + "grad_norm": 0.3581937849521637, + "learning_rate": 2.096216956712826e-06, + "loss": 0.0257, + "step": 28650 + }, + { + "epoch": 0.8194424588992137, + "grad_norm": 0.29479968547821045, + "learning_rate": 2.0948003436344666e-06, + "loss": 0.0275, + "step": 28660 + }, + { + "epoch": 0.8197283774124374, + "grad_norm": 0.5298082232475281, + "learning_rate": 2.0933941813039244e-06, + "loss": 0.0282, + "step": 28670 + }, + { + "epoch": 0.8200142959256612, + "grad_norm": 0.3596552610397339, + "learning_rate": 2.091998471371406e-06, + "loss": 0.0279, + "step": 28680 + }, + { + "epoch": 0.8203002144388849, + "grad_norm": 0.5539126396179199, + "learning_rate": 2.0906132154748557e-06, + "loss": 0.0233, + "step": 28690 + }, + { + "epoch": 0.8205861329521087, + "grad_norm": 0.7187175154685974, + "learning_rate": 2.0892384152399504e-06, + "loss": 0.0352, + "step": 28700 + }, + { + "epoch": 0.8208720514653324, + "grad_norm": 0.5331593155860901, + "learning_rate": 2.0878740722800917e-06, + "loss": 0.0258, + "step": 28710 + }, + { + "epoch": 0.8211579699785561, + "grad_norm": 0.7345555424690247, + "learning_rate": 2.086520188196413e-06, + "loss": 0.0332, + "step": 28720 + }, + { + "epoch": 0.8214438884917799, + "grad_norm": 0.3517567217350006, + "learning_rate": 2.085176764577774e-06, + "loss": 0.0297, + "step": 28730 + }, + { + "epoch": 0.8217298070050035, + "grad_norm": 0.5344868302345276, + "learning_rate": 2.083843803000755e-06, + "loss": 0.031, + "step": 28740 + }, + { + "epoch": 0.8220157255182273, + "grad_norm": 0.34053468704223633, + "learning_rate": 2.0825213050296636e-06, + "loss": 0.0262, + "step": 28750 + }, + { + "epoch": 0.822301644031451, + "grad_norm": 0.5699623227119446, + "learning_rate": 2.081209272216522e-06, + "loss": 0.0304, + "step": 28760 + }, + { + "epoch": 0.8225875625446748, + "grad_norm": 0.788611888885498, + "learning_rate": 2.079907706101075e-06, + "loss": 0.0301, + "step": 28770 + }, + { + "epoch": 0.8228734810578985, + "grad_norm": 0.43659698963165283, + "learning_rate": 2.0786166082107833e-06, + "loss": 0.0264, + "step": 28780 + }, + { + "epoch": 0.8231593995711223, + "grad_norm": 0.4736769199371338, + "learning_rate": 2.0773359800608217e-06, + "loss": 0.0255, + "step": 28790 + }, + { + "epoch": 0.823445318084346, + "grad_norm": 0.39587756991386414, + "learning_rate": 2.076065823154079e-06, + "loss": 0.0273, + "step": 28800 + }, + { + "epoch": 0.8237312365975696, + "grad_norm": 0.4977654218673706, + "learning_rate": 2.0748061389811543e-06, + "loss": 0.0284, + "step": 28810 + }, + { + "epoch": 0.8240171551107934, + "grad_norm": 0.6429978013038635, + "learning_rate": 2.073556929020357e-06, + "loss": 0.0284, + "step": 28820 + }, + { + "epoch": 0.8243030736240171, + "grad_norm": 0.41012001037597656, + "learning_rate": 2.0723181947377057e-06, + "loss": 0.0249, + "step": 28830 + }, + { + "epoch": 0.8245889921372409, + "grad_norm": 0.4937886595726013, + "learning_rate": 2.0710899375869237e-06, + "loss": 0.0321, + "step": 28840 + }, + { + "epoch": 0.8248749106504646, + "grad_norm": 0.3416379988193512, + "learning_rate": 2.0698721590094387e-06, + "loss": 0.0261, + "step": 28850 + }, + { + "epoch": 0.8251608291636884, + "grad_norm": 0.6058022379875183, + "learning_rate": 2.0686648604343824e-06, + "loss": 0.0261, + "step": 28860 + }, + { + "epoch": 0.8254467476769121, + "grad_norm": 0.5241106152534485, + "learning_rate": 2.067468043278587e-06, + "loss": 0.0268, + "step": 28870 + }, + { + "epoch": 0.8257326661901359, + "grad_norm": 0.4829743802547455, + "learning_rate": 2.066281708946583e-06, + "loss": 0.0302, + "step": 28880 + }, + { + "epoch": 0.8260185847033595, + "grad_norm": 0.5692874789237976, + "learning_rate": 2.0651058588306007e-06, + "loss": 0.0283, + "step": 28890 + }, + { + "epoch": 0.8263045032165832, + "grad_norm": 0.3703140914440155, + "learning_rate": 2.063940494310565e-06, + "loss": 0.0236, + "step": 28900 + }, + { + "epoch": 0.826590421729807, + "grad_norm": 0.44660329818725586, + "learning_rate": 2.062785616754097e-06, + "loss": 0.024, + "step": 28910 + }, + { + "epoch": 0.8268763402430307, + "grad_norm": 0.5036526918411255, + "learning_rate": 2.0616412275165097e-06, + "loss": 0.0259, + "step": 28920 + }, + { + "epoch": 0.8271622587562545, + "grad_norm": 0.40677565336227417, + "learning_rate": 2.0605073279408063e-06, + "loss": 0.0287, + "step": 28930 + }, + { + "epoch": 0.8274481772694782, + "grad_norm": 0.37139707803726196, + "learning_rate": 2.0593839193576833e-06, + "loss": 0.0274, + "step": 28940 + }, + { + "epoch": 0.827734095782702, + "grad_norm": 0.4395834803581238, + "learning_rate": 2.058271003085521e-06, + "loss": 0.0243, + "step": 28950 + }, + { + "epoch": 0.8280200142959256, + "grad_norm": 0.41849637031555176, + "learning_rate": 2.0571685804303905e-06, + "loss": 0.0364, + "step": 28960 + }, + { + "epoch": 0.8283059328091494, + "grad_norm": 0.532619833946228, + "learning_rate": 2.0560766526860447e-06, + "loss": 0.0257, + "step": 28970 + }, + { + "epoch": 0.8285918513223731, + "grad_norm": 0.6253917813301086, + "learning_rate": 2.054995221133923e-06, + "loss": 0.0276, + "step": 28980 + }, + { + "epoch": 0.8288777698355968, + "grad_norm": 0.5655578374862671, + "learning_rate": 2.053924287043144e-06, + "loss": 0.0255, + "step": 28990 + }, + { + "epoch": 0.8291636883488206, + "grad_norm": 0.46769171953201294, + "learning_rate": 2.0528638516705106e-06, + "loss": 0.0303, + "step": 29000 + }, + { + "epoch": 0.8294496068620443, + "grad_norm": 0.5021181702613831, + "learning_rate": 2.051813916260501e-06, + "loss": 0.0287, + "step": 29010 + }, + { + "epoch": 0.8297355253752681, + "grad_norm": 0.6354414820671082, + "learning_rate": 2.050774482045273e-06, + "loss": 0.0243, + "step": 29020 + }, + { + "epoch": 0.8300214438884918, + "grad_norm": 0.3923643231391907, + "learning_rate": 2.049745550244661e-06, + "loss": 0.027, + "step": 29030 + }, + { + "epoch": 0.8303073624017155, + "grad_norm": 0.7594497203826904, + "learning_rate": 2.0487271220661735e-06, + "loss": 0.0303, + "step": 29040 + }, + { + "epoch": 0.8305932809149392, + "grad_norm": 1.0209988355636597, + "learning_rate": 2.047719198704994e-06, + "loss": 0.0319, + "step": 29050 + }, + { + "epoch": 0.830879199428163, + "grad_norm": 0.4946758449077606, + "learning_rate": 2.0467217813439762e-06, + "loss": 0.0238, + "step": 29060 + }, + { + "epoch": 0.8311651179413867, + "grad_norm": 0.3641301095485687, + "learning_rate": 2.0457348711536426e-06, + "loss": 0.0299, + "step": 29070 + }, + { + "epoch": 0.8314510364546104, + "grad_norm": 0.4918605387210846, + "learning_rate": 2.0447584692921894e-06, + "loss": 0.0275, + "step": 29080 + }, + { + "epoch": 0.8317369549678342, + "grad_norm": 0.5097633004188538, + "learning_rate": 2.043792576905478e-06, + "loss": 0.0301, + "step": 29090 + }, + { + "epoch": 0.8320228734810579, + "grad_norm": 0.7338542938232422, + "learning_rate": 2.0428371951270394e-06, + "loss": 0.0261, + "step": 29100 + }, + { + "epoch": 0.8323087919942816, + "grad_norm": 0.5453478097915649, + "learning_rate": 2.0418923250780633e-06, + "loss": 0.0279, + "step": 29110 + }, + { + "epoch": 0.8325947105075053, + "grad_norm": 0.38167792558670044, + "learning_rate": 2.0409579678674084e-06, + "loss": 0.0245, + "step": 29120 + }, + { + "epoch": 0.8328806290207291, + "grad_norm": 0.75771164894104, + "learning_rate": 2.040034124591597e-06, + "loss": 0.0305, + "step": 29130 + }, + { + "epoch": 0.8331665475339528, + "grad_norm": 0.820286214351654, + "learning_rate": 2.039120796334809e-06, + "loss": 0.0277, + "step": 29140 + }, + { + "epoch": 0.8334524660471766, + "grad_norm": 0.26554685831069946, + "learning_rate": 2.0382179841688868e-06, + "loss": 0.0309, + "step": 29150 + }, + { + "epoch": 0.8337383845604003, + "grad_norm": 0.5767927765846252, + "learning_rate": 2.0373256891533293e-06, + "loss": 0.0264, + "step": 29160 + }, + { + "epoch": 0.834024303073624, + "grad_norm": 0.45167428255081177, + "learning_rate": 2.0364439123352956e-06, + "loss": 0.0313, + "step": 29170 + }, + { + "epoch": 0.8343102215868478, + "grad_norm": 0.49742552638053894, + "learning_rate": 2.0355726547495998e-06, + "loss": 0.0263, + "step": 29180 + }, + { + "epoch": 0.8345961401000714, + "grad_norm": 0.3473125398159027, + "learning_rate": 2.034711917418711e-06, + "loss": 0.0329, + "step": 29190 + }, + { + "epoch": 0.8348820586132952, + "grad_norm": 0.6035035848617554, + "learning_rate": 2.033861701352752e-06, + "loss": 0.027, + "step": 29200 + }, + { + "epoch": 0.8351679771265189, + "grad_norm": 0.4132099449634552, + "learning_rate": 2.0330220075494992e-06, + "loss": 0.0284, + "step": 29210 + }, + { + "epoch": 0.8354538956397427, + "grad_norm": 0.3904581367969513, + "learning_rate": 2.0321928369943807e-06, + "loss": 0.0271, + "step": 29220 + }, + { + "epoch": 0.8357398141529664, + "grad_norm": 0.40810489654541016, + "learning_rate": 2.031374190660474e-06, + "loss": 0.026, + "step": 29230 + }, + { + "epoch": 0.8360257326661902, + "grad_norm": 0.5281389355659485, + "learning_rate": 2.0305660695085054e-06, + "loss": 0.0264, + "step": 29240 + }, + { + "epoch": 0.8363116511794139, + "grad_norm": 0.3951243758201599, + "learning_rate": 2.0297684744868494e-06, + "loss": 0.0272, + "step": 29250 + }, + { + "epoch": 0.8365975696926375, + "grad_norm": 0.3633115291595459, + "learning_rate": 2.0289814065315306e-06, + "loss": 0.0248, + "step": 29260 + }, + { + "epoch": 0.8368834882058613, + "grad_norm": 0.7965249419212341, + "learning_rate": 2.0282048665662153e-06, + "loss": 0.0443, + "step": 29270 + }, + { + "epoch": 0.837169406719085, + "grad_norm": 0.6424257159233093, + "learning_rate": 2.0274388555022176e-06, + "loss": 0.0257, + "step": 29280 + }, + { + "epoch": 0.8374553252323088, + "grad_norm": 0.4431753158569336, + "learning_rate": 2.0266833742384928e-06, + "loss": 0.0309, + "step": 29290 + }, + { + "epoch": 0.8377412437455325, + "grad_norm": 0.6503756046295166, + "learning_rate": 2.0259384236616404e-06, + "loss": 0.0275, + "step": 29300 + }, + { + "epoch": 0.8380271622587563, + "grad_norm": 0.5955492258071899, + "learning_rate": 2.0252040046459022e-06, + "loss": 0.0282, + "step": 29310 + }, + { + "epoch": 0.83831308077198, + "grad_norm": 0.3691128194332123, + "learning_rate": 2.02448011805316e-06, + "loss": 0.0321, + "step": 29320 + }, + { + "epoch": 0.8385989992852038, + "grad_norm": 0.42951759696006775, + "learning_rate": 2.023766764732934e-06, + "loss": 0.0254, + "step": 29330 + }, + { + "epoch": 0.8388849177984274, + "grad_norm": 0.5496651530265808, + "learning_rate": 2.0230639455223853e-06, + "loss": 0.0273, + "step": 29340 + }, + { + "epoch": 0.8391708363116511, + "grad_norm": 0.44067010283470154, + "learning_rate": 2.0223716612463095e-06, + "loss": 0.0253, + "step": 29350 + }, + { + "epoch": 0.8394567548248749, + "grad_norm": 0.5913621783256531, + "learning_rate": 2.0216899127171424e-06, + "loss": 0.0244, + "step": 29360 + }, + { + "epoch": 0.8397426733380986, + "grad_norm": 0.5189345479011536, + "learning_rate": 2.0210187007349534e-06, + "loss": 0.0278, + "step": 29370 + }, + { + "epoch": 0.8400285918513224, + "grad_norm": 0.5561279058456421, + "learning_rate": 2.0203580260874474e-06, + "loss": 0.0293, + "step": 29380 + }, + { + "epoch": 0.8403145103645461, + "grad_norm": 0.39133086800575256, + "learning_rate": 2.019707889549963e-06, + "loss": 0.0249, + "step": 29390 + }, + { + "epoch": 0.8406004288777699, + "grad_norm": 0.47845765948295593, + "learning_rate": 2.01906829188547e-06, + "loss": 0.0265, + "step": 29400 + }, + { + "epoch": 0.8408863473909935, + "grad_norm": 0.37436914443969727, + "learning_rate": 2.018439233844574e-06, + "loss": 0.0242, + "step": 29410 + }, + { + "epoch": 0.8411722659042173, + "grad_norm": 0.3853163719177246, + "learning_rate": 2.0178207161655087e-06, + "loss": 0.0235, + "step": 29420 + }, + { + "epoch": 0.841458184417441, + "grad_norm": 0.422373503446579, + "learning_rate": 2.0172127395741398e-06, + "loss": 0.0289, + "step": 29430 + }, + { + "epoch": 0.8417441029306647, + "grad_norm": 0.4003738462924957, + "learning_rate": 2.0166153047839603e-06, + "loss": 0.031, + "step": 29440 + }, + { + "epoch": 0.8420300214438885, + "grad_norm": 0.4747001528739929, + "learning_rate": 2.016028412496094e-06, + "loss": 0.0245, + "step": 29450 + }, + { + "epoch": 0.8423159399571122, + "grad_norm": 0.35080668330192566, + "learning_rate": 2.015452063399292e-06, + "loss": 0.0312, + "step": 29460 + }, + { + "epoch": 0.842601858470336, + "grad_norm": 0.406414270401001, + "learning_rate": 2.014886258169932e-06, + "loss": 0.0322, + "step": 29470 + }, + { + "epoch": 0.8428877769835597, + "grad_norm": 0.43083736300468445, + "learning_rate": 2.014330997472017e-06, + "loss": 0.0259, + "step": 29480 + }, + { + "epoch": 0.8431736954967834, + "grad_norm": 0.317290723323822, + "learning_rate": 2.013786281957177e-06, + "loss": 0.0258, + "step": 29490 + }, + { + "epoch": 0.8434596140100071, + "grad_norm": 0.38952547311782837, + "learning_rate": 2.0132521122646662e-06, + "loss": 0.0249, + "step": 29500 + }, + { + "epoch": 0.8437455325232309, + "grad_norm": 0.331080824136734, + "learning_rate": 2.0127284890213623e-06, + "loss": 0.0251, + "step": 29510 + }, + { + "epoch": 0.8440314510364546, + "grad_norm": 0.4097452163696289, + "learning_rate": 2.012215412841767e-06, + "loss": 0.0252, + "step": 29520 + }, + { + "epoch": 0.8443173695496783, + "grad_norm": 0.5636009573936462, + "learning_rate": 2.011712884328003e-06, + "loss": 0.0261, + "step": 29530 + }, + { + "epoch": 0.8446032880629021, + "grad_norm": 0.516639769077301, + "learning_rate": 2.011220904069815e-06, + "loss": 0.0318, + "step": 29540 + }, + { + "epoch": 0.8448892065761258, + "grad_norm": 0.3075452446937561, + "learning_rate": 2.01073947264457e-06, + "loss": 0.0229, + "step": 29550 + }, + { + "epoch": 0.8451751250893496, + "grad_norm": 0.38183867931365967, + "learning_rate": 2.0102685906172543e-06, + "loss": 0.0266, + "step": 29560 + }, + { + "epoch": 0.8454610436025732, + "grad_norm": 0.3114834129810333, + "learning_rate": 2.009808258540475e-06, + "loss": 0.0261, + "step": 29570 + }, + { + "epoch": 0.845746962115797, + "grad_norm": 1.277812123298645, + "learning_rate": 2.009358476954456e-06, + "loss": 0.0341, + "step": 29580 + }, + { + "epoch": 0.8460328806290207, + "grad_norm": 0.44270217418670654, + "learning_rate": 2.008919246387043e-06, + "loss": 0.0279, + "step": 29590 + }, + { + "epoch": 0.8463187991422445, + "grad_norm": 0.9557573199272156, + "learning_rate": 2.0084905673536952e-06, + "loss": 0.0283, + "step": 29600 + }, + { + "epoch": 0.8466047176554682, + "grad_norm": 0.6227599382400513, + "learning_rate": 2.0080724403574922e-06, + "loss": 0.0267, + "step": 29610 + }, + { + "epoch": 0.8468906361686919, + "grad_norm": 0.5279103517532349, + "learning_rate": 2.007664865889131e-06, + "loss": 0.0267, + "step": 29620 + }, + { + "epoch": 0.8471765546819157, + "grad_norm": 0.9109275937080383, + "learning_rate": 2.0072678444269208e-06, + "loss": 0.0336, + "step": 29630 + }, + { + "epoch": 0.8474624731951393, + "grad_norm": 0.26767420768737793, + "learning_rate": 2.006881376436789e-06, + "loss": 0.0249, + "step": 29640 + }, + { + "epoch": 0.8477483917083631, + "grad_norm": 0.3451564610004425, + "learning_rate": 2.0065054623722772e-06, + "loss": 0.0239, + "step": 29650 + }, + { + "epoch": 0.8480343102215868, + "grad_norm": 0.47498252987861633, + "learning_rate": 2.0061401026745425e-06, + "loss": 0.0309, + "step": 29660 + }, + { + "epoch": 0.8483202287348106, + "grad_norm": 0.49723026156425476, + "learning_rate": 2.005785297772354e-06, + "loss": 0.0277, + "step": 29670 + }, + { + "epoch": 0.8486061472480343, + "grad_norm": 0.365113228559494, + "learning_rate": 2.005441048082095e-06, + "loss": 0.0271, + "step": 29680 + }, + { + "epoch": 0.8488920657612581, + "grad_norm": 0.29341810941696167, + "learning_rate": 2.0051073540077617e-06, + "loss": 0.0272, + "step": 29690 + }, + { + "epoch": 0.8491779842744818, + "grad_norm": 0.4523782432079315, + "learning_rate": 2.0047842159409633e-06, + "loss": 0.0314, + "step": 29700 + }, + { + "epoch": 0.8494639027877054, + "grad_norm": 0.3393439054489136, + "learning_rate": 2.004471634260919e-06, + "loss": 0.0279, + "step": 29710 + }, + { + "epoch": 0.8497498213009292, + "grad_norm": 0.4606449007987976, + "learning_rate": 2.004169609334462e-06, + "loss": 0.0331, + "step": 29720 + }, + { + "epoch": 0.8500357398141529, + "grad_norm": 0.35131368041038513, + "learning_rate": 2.003878141516035e-06, + "loss": 0.0299, + "step": 29730 + }, + { + "epoch": 0.8503216583273767, + "grad_norm": 0.34658634662628174, + "learning_rate": 2.0035972311476916e-06, + "loss": 0.0288, + "step": 29740 + }, + { + "epoch": 0.8506075768406004, + "grad_norm": 0.3298470675945282, + "learning_rate": 2.0033268785590954e-06, + "loss": 0.0249, + "step": 29750 + }, + { + "epoch": 0.8508934953538242, + "grad_norm": 0.44585004448890686, + "learning_rate": 2.003067084067522e-06, + "loss": 0.0284, + "step": 29760 + }, + { + "epoch": 0.8511794138670479, + "grad_norm": 0.8015826940536499, + "learning_rate": 2.0028178479778523e-06, + "loss": 0.027, + "step": 29770 + }, + { + "epoch": 0.8514653323802717, + "grad_norm": 0.5542123913764954, + "learning_rate": 2.0025791705825805e-06, + "loss": 0.0261, + "step": 29780 + }, + { + "epoch": 0.8517512508934953, + "grad_norm": 0.5209313035011292, + "learning_rate": 2.0023510521618066e-06, + "loss": 0.0276, + "step": 29790 + }, + { + "epoch": 0.852037169406719, + "grad_norm": 0.3215568959712982, + "learning_rate": 2.0021334929832407e-06, + "loss": 0.0295, + "step": 29800 + }, + { + "epoch": 0.8523230879199428, + "grad_norm": 0.48849478363990784, + "learning_rate": 2.0019264933022016e-06, + "loss": 0.0247, + "step": 29810 + }, + { + "epoch": 0.8526090064331665, + "grad_norm": 0.32018014788627625, + "learning_rate": 2.001730053361614e-06, + "loss": 0.0241, + "step": 29820 + }, + { + "epoch": 0.8528949249463903, + "grad_norm": 0.5317611694335938, + "learning_rate": 2.0015441733920105e-06, + "loss": 0.0267, + "step": 29830 + }, + { + "epoch": 0.853180843459614, + "grad_norm": 0.691618025302887, + "learning_rate": 2.0013688536115332e-06, + "loss": 0.0304, + "step": 29840 + }, + { + "epoch": 0.8534667619728378, + "grad_norm": 0.4876650273799896, + "learning_rate": 2.0012040942259285e-06, + "loss": 0.0264, + "step": 29850 + }, + { + "epoch": 0.8537526804860615, + "grad_norm": 0.464668333530426, + "learning_rate": 2.0010498954285506e-06, + "loss": 0.0247, + "step": 29860 + }, + { + "epoch": 0.8540385989992852, + "grad_norm": 0.694965124130249, + "learning_rate": 2.00090625740036e-06, + "loss": 0.0254, + "step": 29870 + }, + { + "epoch": 0.8543245175125089, + "grad_norm": 0.48797327280044556, + "learning_rate": 2.0007731803099256e-06, + "loss": 0.0325, + "step": 29880 + }, + { + "epoch": 0.8546104360257326, + "grad_norm": 0.3835223913192749, + "learning_rate": 2.00065066431342e-06, + "loss": 0.0292, + "step": 29890 + }, + { + "epoch": 0.8548963545389564, + "grad_norm": 0.5236513614654541, + "learning_rate": 2.0005387095546222e-06, + "loss": 0.029, + "step": 29900 + }, + { + "epoch": 0.8551822730521801, + "grad_norm": 1.3250213861465454, + "learning_rate": 2.000437316164917e-06, + "loss": 0.0282, + "step": 29910 + }, + { + "epoch": 0.8554681915654039, + "grad_norm": 0.24530036747455597, + "learning_rate": 2.000346484263297e-06, + "loss": 0.0306, + "step": 29920 + }, + { + "epoch": 0.8557541100786276, + "grad_norm": 0.3535238802433014, + "learning_rate": 2.0002662139563564e-06, + "loss": 0.0252, + "step": 29930 + }, + { + "epoch": 0.8560400285918514, + "grad_norm": 0.5203806161880493, + "learning_rate": 2.0001965053382976e-06, + "loss": 0.0294, + "step": 29940 + }, + { + "epoch": 0.856325947105075, + "grad_norm": 0.6543874144554138, + "learning_rate": 2.000137358490928e-06, + "loss": 0.0284, + "step": 29950 + }, + { + "epoch": 0.8566118656182988, + "grad_norm": 0.40720510482788086, + "learning_rate": 2.0000887734836583e-06, + "loss": 0.0305, + "step": 29960 + }, + { + "epoch": 0.8568977841315225, + "grad_norm": 0.5478134155273438, + "learning_rate": 2.0000507503735076e-06, + "loss": 0.0281, + "step": 29970 + }, + { + "epoch": 0.8571837026447462, + "grad_norm": 0.35732871294021606, + "learning_rate": 2.0000232892050976e-06, + "loss": 0.0273, + "step": 29980 + }, + { + "epoch": 0.85746962115797, + "grad_norm": 0.39969056844711304, + "learning_rate": 2.000006390010655e-06, + "loss": 0.0229, + "step": 29990 + }, + { + "epoch": 0.8577555396711937, + "grad_norm": 0.3445800840854645, + "learning_rate": 2.0000000528100118e-06, + "loss": 0.0269, + "step": 30000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.87391671271424e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/training_args.bin b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a8e9db2fc8c02e02c3d9dc8ab6720ad303a5b3a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612ba70c7690571cb25b3741b149289d0da6675f330268700d4dd75e92ecc19a +size 6097 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1176494009828ca1a8d623c603070781658572df --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": true, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/generation_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00001-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..48788e9db87ec0a1f3c57369f97599281bb6ff59 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60e236c468d9fe78565b222177a5c1455250ac5e838df3cef20e7b974cdf5175 +size 4921072616 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00002-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..281677a827cfc618245396084c1f660afffe3179 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ce617a8c790ee1ea829e1ced51561b76027197dfcb3de42b9d9fb8d16fda107 +size 4978830984 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00003-of-00003.safetensors b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..114ee897cfc500507adcc5388cadcd6956a4a822 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccfabc16525ea4059e9cdabb977d5a0d0c668b97d1724617738818c4691368ae +size 4100977896 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model.safetensors.index.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/norm_stats.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..7a37358d95e92a337ffbc69008e6d3a514583ff2 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -15.553912042236327, + -29.199742523193358, + -19.58108451538086, + -2.290254103851318, + -3.98537020587921, + -3.326780859374999, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 20.256868560791013, + 29.94644501495361, + 21.81786548461914, + 2.931905368041992, + 5.064435471534729, + 3.8213318216323877, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 0.8829866647720337, + 2.0021812915802, + 0.2094610631465912, + 0.0940750315785408, + 0.0910087525844574, + 0.012966467998921871, + -0.09716881066560745, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.976093769073486, + 10.930583953857422, + 8.330232620239258, + 0.8605863451957703, + 1.5304595232009888, + 1.1747541427612305, + 0.995267927646637, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -14.624815139007566, + -31.510755078125, + -35.281760287475585, + -4.413841687011719, + -8.509904860687255, + -6.548201916885375, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 40.4127169593811, + 31.91034956970215, + 26.84413584289551, + 7.540738459014893, + 10.178268561553956, + 9.913993389892582, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 10.31286334991455, + 3.0421667098999023, + -4.947638511657715, + 0.41632387042045593, + -0.9987452030181885, + -0.18793217837810516, + -0.08814626932144165, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 10.463665962219238, + 14.231209754943848, + 11.03242301940918, + 2.1795010566711426, + 3.3540749549865723, + 2.708117961883545, + 0.9961075186729431, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/pi0.yaml b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/special_tokens_map.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer.model b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer_config.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/trainer_state.json b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..09165e17105fffc15636308b9fc167d630de0175 --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/trainer_state.json @@ -0,0 +1,21043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8577555396711937, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002859185132237312, + "grad_norm": 4.32843542098999, + "learning_rate": 1.8e-07, + "loss": 0.6022, + "step": 10 + }, + { + "epoch": 0.0005718370264474624, + "grad_norm": 5.184113502502441, + "learning_rate": 3.8e-07, + "loss": 0.6206, + "step": 20 + }, + { + "epoch": 0.0008577555396711937, + "grad_norm": 4.515527248382568, + "learning_rate": 5.800000000000001e-07, + "loss": 0.582, + "step": 30 + }, + { + "epoch": 0.0011436740528949249, + "grad_norm": 2.8382818698883057, + "learning_rate": 7.8e-07, + "loss": 0.544, + "step": 40 + }, + { + "epoch": 0.0014295925661186562, + "grad_norm": 4.019079208374023, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6381, + "step": 50 + }, + { + "epoch": 0.0017155110793423873, + "grad_norm": 2.9916157722473145, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5463, + "step": 60 + }, + { + "epoch": 0.0020014295925661185, + "grad_norm": 3.3288328647613525, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.446, + "step": 70 + }, + { + "epoch": 0.0022873481057898498, + "grad_norm": 3.181410312652588, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4497, + "step": 80 + }, + { + "epoch": 0.002573266619013581, + "grad_norm": 1.421942949295044, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.349, + "step": 90 + }, + { + "epoch": 0.0028591851322373124, + "grad_norm": 1.908596396446228, + "learning_rate": 1.98e-06, + "loss": 0.3338, + "step": 100 + }, + { + "epoch": 0.0031451036454610438, + "grad_norm": 1.8309729099273682, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2838, + "step": 110 + }, + { + "epoch": 0.0034310221586847747, + "grad_norm": 3.051408290863037, + "learning_rate": 2.38e-06, + "loss": 0.2418, + "step": 120 + }, + { + "epoch": 0.003716940671908506, + "grad_norm": 2.4083356857299805, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1726, + "step": 130 + }, + { + "epoch": 0.004002859185132237, + "grad_norm": 1.111687421798706, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.2164, + "step": 140 + }, + { + "epoch": 0.004288777698355968, + "grad_norm": 1.3874679803848267, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1312, + "step": 150 + }, + { + "epoch": 0.0045746962115796996, + "grad_norm": 1.2791540622711182, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1198, + "step": 160 + }, + { + "epoch": 0.004860614724803431, + "grad_norm": 1.6237181425094604, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1027, + "step": 170 + }, + { + "epoch": 0.005146533238027162, + "grad_norm": 0.9669432640075684, + "learning_rate": 3.58e-06, + "loss": 0.0968, + "step": 180 + }, + { + "epoch": 0.0054324517512508936, + "grad_norm": 1.4933182001113892, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.1012, + "step": 190 + }, + { + "epoch": 0.005718370264474625, + "grad_norm": 1.8615745306015015, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0901, + "step": 200 + }, + { + "epoch": 0.006004288777698356, + "grad_norm": 1.867163062095642, + "learning_rate": 4.18e-06, + "loss": 0.1067, + "step": 210 + }, + { + "epoch": 0.0062902072909220876, + "grad_norm": 1.199497103691101, + "learning_rate": 4.38e-06, + "loss": 0.0841, + "step": 220 + }, + { + "epoch": 0.006576125804145818, + "grad_norm": 1.1568272113800049, + "learning_rate": 4.58e-06, + "loss": 0.0951, + "step": 230 + }, + { + "epoch": 0.006862044317369549, + "grad_norm": 2.139226198196411, + "learning_rate": 4.78e-06, + "loss": 0.0845, + "step": 240 + }, + { + "epoch": 0.007147962830593281, + "grad_norm": 1.0357667207717896, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0828, + "step": 250 + }, + { + "epoch": 0.007433881343817012, + "grad_norm": 1.0145683288574219, + "learning_rate": 5.18e-06, + "loss": 0.0925, + "step": 260 + }, + { + "epoch": 0.007719799857040743, + "grad_norm": 1.308053731918335, + "learning_rate": 5.380000000000001e-06, + "loss": 0.082, + "step": 270 + }, + { + "epoch": 0.008005718370264474, + "grad_norm": 1.1561739444732666, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0888, + "step": 280 + }, + { + "epoch": 0.008291636883488206, + "grad_norm": 0.8777005672454834, + "learning_rate": 5.78e-06, + "loss": 0.0693, + "step": 290 + }, + { + "epoch": 0.008577555396711936, + "grad_norm": 0.9127368330955505, + "learning_rate": 5.98e-06, + "loss": 0.0823, + "step": 300 + }, + { + "epoch": 0.008863473909935669, + "grad_norm": 0.5608117580413818, + "learning_rate": 6.18e-06, + "loss": 0.0733, + "step": 310 + }, + { + "epoch": 0.009149392423159399, + "grad_norm": 1.9068444967269897, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0772, + "step": 320 + }, + { + "epoch": 0.009435310936383131, + "grad_norm": 0.9090886116027832, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.062, + "step": 330 + }, + { + "epoch": 0.009721229449606862, + "grad_norm": 1.191778540611267, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0718, + "step": 340 + }, + { + "epoch": 0.010007147962830594, + "grad_norm": 1.3743036985397339, + "learning_rate": 6.98e-06, + "loss": 0.0822, + "step": 350 + }, + { + "epoch": 0.010293066476054324, + "grad_norm": 1.4244364500045776, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0793, + "step": 360 + }, + { + "epoch": 0.010578984989278055, + "grad_norm": 1.1766910552978516, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0637, + "step": 370 + }, + { + "epoch": 0.010864903502501787, + "grad_norm": 1.1331329345703125, + "learning_rate": 7.58e-06, + "loss": 0.0705, + "step": 380 + }, + { + "epoch": 0.011150822015725518, + "grad_norm": 0.4898548424243927, + "learning_rate": 7.78e-06, + "loss": 0.0686, + "step": 390 + }, + { + "epoch": 0.01143674052894925, + "grad_norm": 0.7398406267166138, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0719, + "step": 400 + }, + { + "epoch": 0.01172265904217298, + "grad_norm": 1.1516162157058716, + "learning_rate": 8.18e-06, + "loss": 0.0696, + "step": 410 + }, + { + "epoch": 0.012008577555396712, + "grad_norm": 1.6034163236618042, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0698, + "step": 420 + }, + { + "epoch": 0.012294496068620443, + "grad_norm": 1.2195311784744263, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0574, + "step": 430 + }, + { + "epoch": 0.012580414581844175, + "grad_norm": 1.1106441020965576, + "learning_rate": 8.78e-06, + "loss": 0.0749, + "step": 440 + }, + { + "epoch": 0.012866333095067906, + "grad_norm": 1.1787506341934204, + "learning_rate": 8.98e-06, + "loss": 0.0718, + "step": 450 + }, + { + "epoch": 0.013152251608291636, + "grad_norm": 0.4380492568016052, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0692, + "step": 460 + }, + { + "epoch": 0.013438170121515368, + "grad_norm": 1.0138392448425293, + "learning_rate": 9.38e-06, + "loss": 0.0718, + "step": 470 + }, + { + "epoch": 0.013724088634739099, + "grad_norm": 0.50003582239151, + "learning_rate": 9.58e-06, + "loss": 0.078, + "step": 480 + }, + { + "epoch": 0.014010007147962831, + "grad_norm": 0.6253323554992676, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0762, + "step": 490 + }, + { + "epoch": 0.014295925661186561, + "grad_norm": 0.6725791096687317, + "learning_rate": 9.980000000000001e-06, + "loss": 0.0615, + "step": 500 + }, + { + "epoch": 0.014581844174410294, + "grad_norm": 0.6100206971168518, + "learning_rate": 1.018e-05, + "loss": 0.0576, + "step": 510 + }, + { + "epoch": 0.014867762687634024, + "grad_norm": 1.9225071668624878, + "learning_rate": 1.038e-05, + "loss": 0.0957, + "step": 520 + }, + { + "epoch": 0.015153681200857756, + "grad_norm": 1.304625391960144, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0644, + "step": 530 + }, + { + "epoch": 0.015439599714081487, + "grad_norm": 0.7657200694084167, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0612, + "step": 540 + }, + { + "epoch": 0.015725518227305217, + "grad_norm": 0.7371220588684082, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0719, + "step": 550 + }, + { + "epoch": 0.016011436740528948, + "grad_norm": 0.7274985313415527, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0594, + "step": 560 + }, + { + "epoch": 0.01629735525375268, + "grad_norm": 1.3222947120666504, + "learning_rate": 1.138e-05, + "loss": 0.0655, + "step": 570 + }, + { + "epoch": 0.016583273766976412, + "grad_norm": 0.965411901473999, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0638, + "step": 580 + }, + { + "epoch": 0.016869192280200142, + "grad_norm": 0.8161532878875732, + "learning_rate": 1.178e-05, + "loss": 0.0532, + "step": 590 + }, + { + "epoch": 0.017155110793423873, + "grad_norm": 0.8228808045387268, + "learning_rate": 1.198e-05, + "loss": 0.051, + "step": 600 + }, + { + "epoch": 0.017441029306647607, + "grad_norm": 0.6932743191719055, + "learning_rate": 1.218e-05, + "loss": 0.0595, + "step": 610 + }, + { + "epoch": 0.017726947819871337, + "grad_norm": 0.6848511099815369, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0589, + "step": 620 + }, + { + "epoch": 0.018012866333095068, + "grad_norm": 1.137454867362976, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0694, + "step": 630 + }, + { + "epoch": 0.018298784846318798, + "grad_norm": 0.8087878227233887, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0738, + "step": 640 + }, + { + "epoch": 0.01858470335954253, + "grad_norm": 0.8093737363815308, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.058, + "step": 650 + }, + { + "epoch": 0.018870621872766263, + "grad_norm": 0.8387401700019836, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.0686, + "step": 660 + }, + { + "epoch": 0.019156540385989993, + "grad_norm": 1.1544110774993896, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0592, + "step": 670 + }, + { + "epoch": 0.019442458899213724, + "grad_norm": 0.8208314180374146, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0565, + "step": 680 + }, + { + "epoch": 0.019728377412437454, + "grad_norm": 0.97088623046875, + "learning_rate": 1.378e-05, + "loss": 0.0675, + "step": 690 + }, + { + "epoch": 0.020014295925661188, + "grad_norm": 1.0991814136505127, + "learning_rate": 1.398e-05, + "loss": 0.0745, + "step": 700 + }, + { + "epoch": 0.02030021443888492, + "grad_norm": 0.9467299580574036, + "learning_rate": 1.418e-05, + "loss": 0.0645, + "step": 710 + }, + { + "epoch": 0.02058613295210865, + "grad_norm": 0.4910801351070404, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0466, + "step": 720 + }, + { + "epoch": 0.02087205146533238, + "grad_norm": 1.0102845430374146, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0735, + "step": 730 + }, + { + "epoch": 0.02115796997855611, + "grad_norm": 0.9033467769622803, + "learning_rate": 1.478e-05, + "loss": 0.0741, + "step": 740 + }, + { + "epoch": 0.021443888491779844, + "grad_norm": 1.6092171669006348, + "learning_rate": 1.498e-05, + "loss": 0.0737, + "step": 750 + }, + { + "epoch": 0.021729807005003574, + "grad_norm": 0.7047333717346191, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0604, + "step": 760 + }, + { + "epoch": 0.022015725518227305, + "grad_norm": 1.2015491724014282, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0646, + "step": 770 + }, + { + "epoch": 0.022301644031451035, + "grad_norm": 1.1669623851776123, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.0587, + "step": 780 + }, + { + "epoch": 0.02258756254467477, + "grad_norm": 1.137113094329834, + "learning_rate": 1.578e-05, + "loss": 0.0692, + "step": 790 + }, + { + "epoch": 0.0228734810578985, + "grad_norm": 1.269505262374878, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0711, + "step": 800 + }, + { + "epoch": 0.02315939957112223, + "grad_norm": 0.942534863948822, + "learning_rate": 1.618e-05, + "loss": 0.0782, + "step": 810 + }, + { + "epoch": 0.02344531808434596, + "grad_norm": 0.9548556208610535, + "learning_rate": 1.638e-05, + "loss": 0.0814, + "step": 820 + }, + { + "epoch": 0.02373123659756969, + "grad_norm": 1.0210421085357666, + "learning_rate": 1.658e-05, + "loss": 0.0774, + "step": 830 + }, + { + "epoch": 0.024017155110793425, + "grad_norm": 1.0955135822296143, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0693, + "step": 840 + }, + { + "epoch": 0.024303073624017155, + "grad_norm": 1.2081682682037354, + "learning_rate": 1.698e-05, + "loss": 0.0589, + "step": 850 + }, + { + "epoch": 0.024588992137240886, + "grad_norm": 0.9728164076805115, + "learning_rate": 1.718e-05, + "loss": 0.0585, + "step": 860 + }, + { + "epoch": 0.024874910650464616, + "grad_norm": 1.310244083404541, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.066, + "step": 870 + }, + { + "epoch": 0.02516082916368835, + "grad_norm": 0.8860681653022766, + "learning_rate": 1.758e-05, + "loss": 0.0703, + "step": 880 + }, + { + "epoch": 0.02544674767691208, + "grad_norm": 2.1878466606140137, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0913, + "step": 890 + }, + { + "epoch": 0.02573266619013581, + "grad_norm": 0.6659205555915833, + "learning_rate": 1.798e-05, + "loss": 0.0603, + "step": 900 + }, + { + "epoch": 0.02601858470335954, + "grad_norm": 0.6700656414031982, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.074, + "step": 910 + }, + { + "epoch": 0.026304503216583272, + "grad_norm": 0.8292778134346008, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0529, + "step": 920 + }, + { + "epoch": 0.026590421729807006, + "grad_norm": 0.9637550115585327, + "learning_rate": 1.858e-05, + "loss": 0.0604, + "step": 930 + }, + { + "epoch": 0.026876340243030736, + "grad_norm": 0.4605652689933777, + "learning_rate": 1.878e-05, + "loss": 0.0657, + "step": 940 + }, + { + "epoch": 0.027162258756254467, + "grad_norm": 1.3346972465515137, + "learning_rate": 1.898e-05, + "loss": 0.0576, + "step": 950 + }, + { + "epoch": 0.027448177269478197, + "grad_norm": 0.8369432091712952, + "learning_rate": 1.918e-05, + "loss": 0.0567, + "step": 960 + }, + { + "epoch": 0.02773409578270193, + "grad_norm": 0.613459050655365, + "learning_rate": 1.938e-05, + "loss": 0.0523, + "step": 970 + }, + { + "epoch": 0.028020014295925662, + "grad_norm": 1.402799367904663, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0794, + "step": 980 + }, + { + "epoch": 0.028305932809149392, + "grad_norm": 1.1603201627731323, + "learning_rate": 1.978e-05, + "loss": 0.0583, + "step": 990 + }, + { + "epoch": 0.028591851322373123, + "grad_norm": 0.8101517558097839, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0584, + "step": 1000 + }, + { + "epoch": 0.028877769835596853, + "grad_norm": 1.060592770576477, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.077, + "step": 1010 + }, + { + "epoch": 0.029163688348820587, + "grad_norm": 1.2096195220947266, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.066, + "step": 1020 + }, + { + "epoch": 0.029449606862044318, + "grad_norm": 1.0035862922668457, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0625, + "step": 1030 + }, + { + "epoch": 0.029735525375268048, + "grad_norm": 0.44185084104537964, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0611, + "step": 1040 + }, + { + "epoch": 0.03002144388849178, + "grad_norm": 1.209908127784729, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0693, + "step": 1050 + }, + { + "epoch": 0.030307362401715512, + "grad_norm": 0.9716938138008118, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0706, + "step": 1060 + }, + { + "epoch": 0.030593280914939243, + "grad_norm": 0.8310994505882263, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0622, + "step": 1070 + }, + { + "epoch": 0.030879199428162973, + "grad_norm": 0.8737888932228088, + "learning_rate": 1.999967041472886e-05, + "loss": 0.0564, + "step": 1080 + }, + { + "epoch": 0.031165117941386704, + "grad_norm": 0.7609763145446777, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.0552, + "step": 1090 + }, + { + "epoch": 0.031451036454610434, + "grad_norm": 0.6319764256477356, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0593, + "step": 1100 + }, + { + "epoch": 0.031736954967834165, + "grad_norm": 0.5562251806259155, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0553, + "step": 1110 + }, + { + "epoch": 0.032022873481057895, + "grad_norm": 1.3476046323776245, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0805, + "step": 1120 + }, + { + "epoch": 0.03230879199428163, + "grad_norm": 0.5449394583702087, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0666, + "step": 1130 + }, + { + "epoch": 0.03259471050750536, + "grad_norm": 0.8675817251205444, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0703, + "step": 1140 + }, + { + "epoch": 0.032880629020729094, + "grad_norm": 0.8713150024414062, + "learning_rate": 1.999882759038658e-05, + "loss": 0.063, + "step": 1150 + }, + { + "epoch": 0.033166547533952824, + "grad_norm": 0.7205761075019836, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0597, + "step": 1160 + }, + { + "epoch": 0.033452466047176554, + "grad_norm": 0.482741117477417, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0558, + "step": 1170 + }, + { + "epoch": 0.033738384560400285, + "grad_norm": 0.8652167320251465, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0559, + "step": 1180 + }, + { + "epoch": 0.034024303073624015, + "grad_norm": 0.5286755561828613, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0673, + "step": 1190 + }, + { + "epoch": 0.034310221586847746, + "grad_norm": 0.9883217215538025, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0609, + "step": 1200 + }, + { + "epoch": 0.034596140100071476, + "grad_norm": 0.7700253129005432, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0703, + "step": 1210 + }, + { + "epoch": 0.034882058613295214, + "grad_norm": 0.8669867515563965, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.0577, + "step": 1220 + }, + { + "epoch": 0.035167977126518944, + "grad_norm": 0.8856104016304016, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0599, + "step": 1230 + }, + { + "epoch": 0.035453895639742675, + "grad_norm": 0.5517004728317261, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0791, + "step": 1240 + }, + { + "epoch": 0.035739814152966405, + "grad_norm": 0.7505853176116943, + "learning_rate": 1.999672592499692e-05, + "loss": 0.086, + "step": 1250 + }, + { + "epoch": 0.036025732666190136, + "grad_norm": 0.7412230968475342, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0534, + "step": 1260 + }, + { + "epoch": 0.036311651179413866, + "grad_norm": 0.6629419922828674, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0607, + "step": 1270 + }, + { + "epoch": 0.036597569692637597, + "grad_norm": 0.7081887125968933, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0513, + "step": 1280 + }, + { + "epoch": 0.03688348820586133, + "grad_norm": 0.8555129766464233, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0574, + "step": 1290 + }, + { + "epoch": 0.03716940671908506, + "grad_norm": 0.5992563366889954, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0529, + "step": 1300 + }, + { + "epoch": 0.037455325232308795, + "grad_norm": 0.8527185320854187, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.0588, + "step": 1310 + }, + { + "epoch": 0.037741243745532525, + "grad_norm": 1.078600525856018, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0644, + "step": 1320 + }, + { + "epoch": 0.038027162258756256, + "grad_norm": 0.8158502578735352, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0587, + "step": 1330 + }, + { + "epoch": 0.038313080771979986, + "grad_norm": 1.011278748512268, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0696, + "step": 1340 + }, + { + "epoch": 0.03859899928520372, + "grad_norm": 0.806888222694397, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0588, + "step": 1350 + }, + { + "epoch": 0.03888491779842745, + "grad_norm": 0.7776031494140625, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0461, + "step": 1360 + }, + { + "epoch": 0.03917083631165118, + "grad_norm": 0.6119349598884583, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.0566, + "step": 1370 + }, + { + "epoch": 0.03945675482487491, + "grad_norm": 0.6168059706687927, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0636, + "step": 1380 + }, + { + "epoch": 0.03974267333809864, + "grad_norm": 0.8180692195892334, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0596, + "step": 1390 + }, + { + "epoch": 0.040028591851322376, + "grad_norm": 0.6775726079940796, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0586, + "step": 1400 + }, + { + "epoch": 0.040314510364546106, + "grad_norm": 0.7446377873420715, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.057, + "step": 1410 + }, + { + "epoch": 0.04060042887776984, + "grad_norm": 0.9334514737129211, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0551, + "step": 1420 + }, + { + "epoch": 0.04088634739099357, + "grad_norm": 1.481874942779541, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0664, + "step": 1430 + }, + { + "epoch": 0.0411722659042173, + "grad_norm": 0.9553850889205933, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0645, + "step": 1440 + }, + { + "epoch": 0.04145818441744103, + "grad_norm": 0.8824119567871094, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0694, + "step": 1450 + }, + { + "epoch": 0.04174410293066476, + "grad_norm": 1.0382661819458008, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0683, + "step": 1460 + }, + { + "epoch": 0.04203002144388849, + "grad_norm": 0.5914127826690674, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0652, + "step": 1470 + }, + { + "epoch": 0.04231593995711222, + "grad_norm": 0.8497964143753052, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0706, + "step": 1480 + }, + { + "epoch": 0.04260185847033596, + "grad_norm": 0.897759199142456, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0647, + "step": 1490 + }, + { + "epoch": 0.04288777698355969, + "grad_norm": 1.1102443933486938, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0579, + "step": 1500 + }, + { + "epoch": 0.04317369549678342, + "grad_norm": 0.7638678550720215, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0606, + "step": 1510 + }, + { + "epoch": 0.04345961401000715, + "grad_norm": 0.6662708520889282, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.067, + "step": 1520 + }, + { + "epoch": 0.04374553252323088, + "grad_norm": 0.4957924485206604, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0685, + "step": 1530 + }, + { + "epoch": 0.04403145103645461, + "grad_norm": 0.6456794738769531, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0665, + "step": 1540 + }, + { + "epoch": 0.04431736954967834, + "grad_norm": 1.1598498821258545, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0527, + "step": 1550 + }, + { + "epoch": 0.04460328806290207, + "grad_norm": 0.931520938873291, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0682, + "step": 1560 + }, + { + "epoch": 0.0448892065761258, + "grad_norm": 0.7289925813674927, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0726, + "step": 1570 + }, + { + "epoch": 0.04517512508934954, + "grad_norm": 0.5471235513687134, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.0561, + "step": 1580 + }, + { + "epoch": 0.04546104360257327, + "grad_norm": 0.8686550259590149, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0552, + "step": 1590 + }, + { + "epoch": 0.045746962115797, + "grad_norm": 1.1767120361328125, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0544, + "step": 1600 + }, + { + "epoch": 0.04603288062902073, + "grad_norm": 0.8729729056358337, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0625, + "step": 1610 + }, + { + "epoch": 0.04631879914224446, + "grad_norm": 1.3734601736068726, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0667, + "step": 1620 + }, + { + "epoch": 0.04660471765546819, + "grad_norm": 0.6810682415962219, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0522, + "step": 1630 + }, + { + "epoch": 0.04689063616869192, + "grad_norm": 0.7744873762130737, + "learning_rate": 1.997844517262844e-05, + "loss": 0.06, + "step": 1640 + }, + { + "epoch": 0.04717655468191565, + "grad_norm": 1.000954270362854, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0606, + "step": 1650 + }, + { + "epoch": 0.04746247319513938, + "grad_norm": 0.8105701208114624, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0511, + "step": 1660 + }, + { + "epoch": 0.04774839170836312, + "grad_norm": 0.9504240155220032, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0833, + "step": 1670 + }, + { + "epoch": 0.04803431022158685, + "grad_norm": 0.910836935043335, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0582, + "step": 1680 + }, + { + "epoch": 0.04832022873481058, + "grad_norm": 0.5865645408630371, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0657, + "step": 1690 + }, + { + "epoch": 0.04860614724803431, + "grad_norm": 1.0098698139190674, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0661, + "step": 1700 + }, + { + "epoch": 0.04889206576125804, + "grad_norm": 0.8097764253616333, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0563, + "step": 1710 + }, + { + "epoch": 0.04917798427448177, + "grad_norm": 0.9958128333091736, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0597, + "step": 1720 + }, + { + "epoch": 0.0494639027877055, + "grad_norm": 0.8471905589103699, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0493, + "step": 1730 + }, + { + "epoch": 0.04974982130092923, + "grad_norm": 0.647058367729187, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0508, + "step": 1740 + }, + { + "epoch": 0.05003573981415296, + "grad_norm": 1.0832161903381348, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.052, + "step": 1750 + }, + { + "epoch": 0.0503216583273767, + "grad_norm": 0.8469381332397461, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0615, + "step": 1760 + }, + { + "epoch": 0.05060757684060043, + "grad_norm": 0.5371052622795105, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0556, + "step": 1770 + }, + { + "epoch": 0.05089349535382416, + "grad_norm": 0.9016183614730835, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0561, + "step": 1780 + }, + { + "epoch": 0.05117941386704789, + "grad_norm": 0.8829526305198669, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0648, + "step": 1790 + }, + { + "epoch": 0.05146533238027162, + "grad_norm": 1.079738974571228, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0577, + "step": 1800 + }, + { + "epoch": 0.05175125089349535, + "grad_norm": 0.7496556639671326, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.052, + "step": 1810 + }, + { + "epoch": 0.05203716940671908, + "grad_norm": 0.7587016820907593, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0634, + "step": 1820 + }, + { + "epoch": 0.052323087919942814, + "grad_norm": 0.9622246623039246, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0543, + "step": 1830 + }, + { + "epoch": 0.052609006433166544, + "grad_norm": 0.6643623113632202, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0606, + "step": 1840 + }, + { + "epoch": 0.05289492494639028, + "grad_norm": 0.8060843348503113, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0562, + "step": 1850 + }, + { + "epoch": 0.05318084345961401, + "grad_norm": 0.7353034019470215, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0679, + "step": 1860 + }, + { + "epoch": 0.05346676197283774, + "grad_norm": 0.6636782288551331, + "learning_rate": 1.996014938229576e-05, + "loss": 0.0561, + "step": 1870 + }, + { + "epoch": 0.05375268048606147, + "grad_norm": 0.6760010719299316, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0674, + "step": 1880 + }, + { + "epoch": 0.0540385989992852, + "grad_norm": 0.7144591808319092, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0551, + "step": 1890 + }, + { + "epoch": 0.054324517512508934, + "grad_norm": 0.8346575498580933, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.049, + "step": 1900 + }, + { + "epoch": 0.054610436025732664, + "grad_norm": 1.1682871580123901, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0554, + "step": 1910 + }, + { + "epoch": 0.054896354538956395, + "grad_norm": 0.9150840640068054, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0549, + "step": 1920 + }, + { + "epoch": 0.055182273052180125, + "grad_norm": 0.37064746022224426, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0547, + "step": 1930 + }, + { + "epoch": 0.05546819156540386, + "grad_norm": 1.1214783191680908, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0697, + "step": 1940 + }, + { + "epoch": 0.05575411007862759, + "grad_norm": 0.8259853720664978, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0614, + "step": 1950 + }, + { + "epoch": 0.056040028591851324, + "grad_norm": 1.2340948581695557, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0811, + "step": 1960 + }, + { + "epoch": 0.056325947105075054, + "grad_norm": 0.8764797449111938, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0605, + "step": 1970 + }, + { + "epoch": 0.056611865618298784, + "grad_norm": 0.770044207572937, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0481, + "step": 1980 + }, + { + "epoch": 0.056897784131522515, + "grad_norm": 1.333876132965088, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0729, + "step": 1990 + }, + { + "epoch": 0.057183702644746245, + "grad_norm": 0.5231258273124695, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.051, + "step": 2000 + }, + { + "epoch": 0.057469621157969976, + "grad_norm": 1.1937541961669922, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.061, + "step": 2010 + }, + { + "epoch": 0.057755539671193706, + "grad_norm": 0.7843487858772278, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0688, + "step": 2020 + }, + { + "epoch": 0.058041458184417444, + "grad_norm": 0.7956593632698059, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0878, + "step": 2030 + }, + { + "epoch": 0.058327376697641174, + "grad_norm": 0.5006444454193115, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0588, + "step": 2040 + }, + { + "epoch": 0.058613295210864905, + "grad_norm": 1.162245750427246, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.0619, + "step": 2050 + }, + { + "epoch": 0.058899213724088635, + "grad_norm": 0.46943384408950806, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0584, + "step": 2060 + }, + { + "epoch": 0.059185132237312366, + "grad_norm": 0.3780323266983032, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0462, + "step": 2070 + }, + { + "epoch": 0.059471050750536096, + "grad_norm": 0.7066171765327454, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0652, + "step": 2080 + }, + { + "epoch": 0.05975696926375983, + "grad_norm": 0.8464685082435608, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0546, + "step": 2090 + }, + { + "epoch": 0.06004288777698356, + "grad_norm": 0.7198944687843323, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0534, + "step": 2100 + }, + { + "epoch": 0.06032880629020729, + "grad_norm": 0.7136557698249817, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0665, + "step": 2110 + }, + { + "epoch": 0.060614724803431025, + "grad_norm": 0.8739225268363953, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.0542, + "step": 2120 + }, + { + "epoch": 0.060900643316654755, + "grad_norm": 0.6694063544273376, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0575, + "step": 2130 + }, + { + "epoch": 0.061186561829878486, + "grad_norm": 0.4805296063423157, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0578, + "step": 2140 + }, + { + "epoch": 0.061472480343102216, + "grad_norm": 0.758660078048706, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0455, + "step": 2150 + }, + { + "epoch": 0.06175839885632595, + "grad_norm": 0.8114968538284302, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0801, + "step": 2160 + }, + { + "epoch": 0.06204431736954968, + "grad_norm": 0.6585670113563538, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0564, + "step": 2170 + }, + { + "epoch": 0.06233023588277341, + "grad_norm": 1.2986794710159302, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0595, + "step": 2180 + }, + { + "epoch": 0.06261615439599715, + "grad_norm": 0.9822471141815186, + "learning_rate": 1.992544454099507e-05, + "loss": 0.0515, + "step": 2190 + }, + { + "epoch": 0.06290207290922087, + "grad_norm": 0.8112025260925293, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0585, + "step": 2200 + }, + { + "epoch": 0.0631879914224446, + "grad_norm": 0.6239551305770874, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0641, + "step": 2210 + }, + { + "epoch": 0.06347390993566833, + "grad_norm": 0.8405657410621643, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.057, + "step": 2220 + }, + { + "epoch": 0.06375982844889207, + "grad_norm": 0.4925670623779297, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0568, + "step": 2230 + }, + { + "epoch": 0.06404574696211579, + "grad_norm": 0.8599978089332581, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0587, + "step": 2240 + }, + { + "epoch": 0.06433166547533953, + "grad_norm": 0.8657258749008179, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0628, + "step": 2250 + }, + { + "epoch": 0.06461758398856327, + "grad_norm": 0.5826218128204346, + "learning_rate": 1.991642153373178e-05, + "loss": 0.055, + "step": 2260 + }, + { + "epoch": 0.06490350250178699, + "grad_norm": 0.9265853762626648, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.0533, + "step": 2270 + }, + { + "epoch": 0.06518942101501073, + "grad_norm": 0.8345134258270264, + "learning_rate": 1.991374933341515e-05, + "loss": 0.064, + "step": 2280 + }, + { + "epoch": 0.06547533952823445, + "grad_norm": 0.6610177755355835, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0553, + "step": 2290 + }, + { + "epoch": 0.06576125804145819, + "grad_norm": 0.8541404604911804, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0483, + "step": 2300 + }, + { + "epoch": 0.06604717655468191, + "grad_norm": 0.9029123187065125, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0517, + "step": 2310 + }, + { + "epoch": 0.06633309506790565, + "grad_norm": 0.614111602306366, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.06661901358112937, + "grad_norm": 0.8723806142807007, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0511, + "step": 2330 + }, + { + "epoch": 0.06690493209435311, + "grad_norm": 0.5288586020469666, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0474, + "step": 2340 + }, + { + "epoch": 0.06719085060757685, + "grad_norm": 0.6346511840820312, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0532, + "step": 2350 + }, + { + "epoch": 0.06747676912080057, + "grad_norm": 0.9112687706947327, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0583, + "step": 2360 + }, + { + "epoch": 0.06776268763402431, + "grad_norm": 0.6879385113716125, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0551, + "step": 2370 + }, + { + "epoch": 0.06804860614724803, + "grad_norm": 0.6945562958717346, + "learning_rate": 1.989976094288735e-05, + "loss": 0.053, + "step": 2380 + }, + { + "epoch": 0.06833452466047177, + "grad_norm": 0.6774301528930664, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.0596, + "step": 2390 + }, + { + "epoch": 0.06862044317369549, + "grad_norm": 0.7311446070671082, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0576, + "step": 2400 + }, + { + "epoch": 0.06890636168691923, + "grad_norm": 0.9301936030387878, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0608, + "step": 2410 + }, + { + "epoch": 0.06919228020014295, + "grad_norm": 1.1750341653823853, + "learning_rate": 1.989387305123247e-05, + "loss": 0.059, + "step": 2420 + }, + { + "epoch": 0.06947819871336669, + "grad_norm": 0.716266930103302, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.053, + "step": 2430 + }, + { + "epoch": 0.06976411722659043, + "grad_norm": 0.8549973964691162, + "learning_rate": 1.989086647373215e-05, + "loss": 0.061, + "step": 2440 + }, + { + "epoch": 0.07005003573981415, + "grad_norm": 0.7306638360023499, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0595, + "step": 2450 + }, + { + "epoch": 0.07033595425303789, + "grad_norm": 1.2529624700546265, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0597, + "step": 2460 + }, + { + "epoch": 0.07062187276626161, + "grad_norm": 0.7199717164039612, + "learning_rate": 1.988627835751598e-05, + "loss": 0.047, + "step": 2470 + }, + { + "epoch": 0.07090779127948535, + "grad_norm": 0.8007253408432007, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0544, + "step": 2480 + }, + { + "epoch": 0.07119370979270907, + "grad_norm": 0.7852535843849182, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0507, + "step": 2490 + }, + { + "epoch": 0.07147962830593281, + "grad_norm": 1.0649739503860474, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.058, + "step": 2500 + }, + { + "epoch": 0.07176554681915653, + "grad_norm": 0.8080071806907654, + "learning_rate": 1.988001487826387e-05, + "loss": 0.059, + "step": 2510 + }, + { + "epoch": 0.07205146533238027, + "grad_norm": 0.49453601241111755, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0522, + "step": 2520 + }, + { + "epoch": 0.07233738384560401, + "grad_norm": 0.7618975639343262, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0557, + "step": 2530 + }, + { + "epoch": 0.07262330235882773, + "grad_norm": 0.6284596920013428, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.058, + "step": 2540 + }, + { + "epoch": 0.07290922087205147, + "grad_norm": 1.6536812782287598, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0535, + "step": 2550 + }, + { + "epoch": 0.07319513938527519, + "grad_norm": 0.6516987681388855, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.061, + "step": 2560 + }, + { + "epoch": 0.07348105789849893, + "grad_norm": 0.7660441398620605, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0603, + "step": 2570 + }, + { + "epoch": 0.07376697641172265, + "grad_norm": 0.7900884747505188, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0494, + "step": 2580 + }, + { + "epoch": 0.07405289492494639, + "grad_norm": 0.9578459858894348, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0492, + "step": 2590 + }, + { + "epoch": 0.07433881343817011, + "grad_norm": 0.5268751978874207, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0524, + "step": 2600 + }, + { + "epoch": 0.07462473195139385, + "grad_norm": 0.8935990929603577, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0537, + "step": 2610 + }, + { + "epoch": 0.07491065046461759, + "grad_norm": 0.940441370010376, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0545, + "step": 2620 + }, + { + "epoch": 0.07519656897784131, + "grad_norm": 0.42767468094825745, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0508, + "step": 2630 + }, + { + "epoch": 0.07548248749106505, + "grad_norm": 0.6892207860946655, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0417, + "step": 2640 + }, + { + "epoch": 0.07576840600428877, + "grad_norm": 1.2622859477996826, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0665, + "step": 2650 + }, + { + "epoch": 0.07605432451751251, + "grad_norm": 0.8809115290641785, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0617, + "step": 2660 + }, + { + "epoch": 0.07634024303073624, + "grad_norm": 0.604371190071106, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.0577, + "step": 2670 + }, + { + "epoch": 0.07662616154395997, + "grad_norm": 0.7091525793075562, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0552, + "step": 2680 + }, + { + "epoch": 0.0769120800571837, + "grad_norm": 0.7841326594352722, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0579, + "step": 2690 + }, + { + "epoch": 0.07719799857040743, + "grad_norm": 0.7789046764373779, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0511, + "step": 2700 + }, + { + "epoch": 0.07748391708363117, + "grad_norm": 0.6497660875320435, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0532, + "step": 2710 + }, + { + "epoch": 0.0777698355968549, + "grad_norm": 0.6902356147766113, + "learning_rate": 1.984439891859038e-05, + "loss": 0.06, + "step": 2720 + }, + { + "epoch": 0.07805575411007863, + "grad_norm": 0.5721703767776489, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0442, + "step": 2730 + }, + { + "epoch": 0.07834167262330236, + "grad_norm": 0.5205336809158325, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0551, + "step": 2740 + }, + { + "epoch": 0.07862759113652609, + "grad_norm": 1.0646073818206787, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0546, + "step": 2750 + }, + { + "epoch": 0.07891350964974982, + "grad_norm": 0.6809906363487244, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0603, + "step": 2760 + }, + { + "epoch": 0.07919942816297355, + "grad_norm": 0.7592756152153015, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0611, + "step": 2770 + }, + { + "epoch": 0.07948534667619728, + "grad_norm": 0.970733106136322, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.066, + "step": 2780 + }, + { + "epoch": 0.07977126518942101, + "grad_norm": 0.5978608727455139, + "learning_rate": 1.983150881656814e-05, + "loss": 0.049, + "step": 2790 + }, + { + "epoch": 0.08005718370264475, + "grad_norm": 0.6761397123336792, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.048, + "step": 2800 + }, + { + "epoch": 0.08034310221586848, + "grad_norm": 0.9752228856086731, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0661, + "step": 2810 + }, + { + "epoch": 0.08062902072909221, + "grad_norm": 0.8727124929428101, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0629, + "step": 2820 + }, + { + "epoch": 0.08091493924231594, + "grad_norm": 0.8425240516662598, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0517, + "step": 2830 + }, + { + "epoch": 0.08120085775553967, + "grad_norm": 0.7011470198631287, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0484, + "step": 2840 + }, + { + "epoch": 0.0814867762687634, + "grad_norm": 0.836200475692749, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0504, + "step": 2850 + }, + { + "epoch": 0.08177269478198713, + "grad_norm": 0.4431964159011841, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0527, + "step": 2860 + }, + { + "epoch": 0.08205861329521086, + "grad_norm": 0.4666791260242462, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.0556, + "step": 2870 + }, + { + "epoch": 0.0823445318084346, + "grad_norm": 0.5705346465110779, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0544, + "step": 2880 + }, + { + "epoch": 0.08263045032165833, + "grad_norm": 1.7237486839294434, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0555, + "step": 2890 + }, + { + "epoch": 0.08291636883488206, + "grad_norm": 0.9305147528648376, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.055, + "step": 2900 + }, + { + "epoch": 0.0832022873481058, + "grad_norm": 1.3475992679595947, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0723, + "step": 2910 + }, + { + "epoch": 0.08348820586132952, + "grad_norm": 0.7196787595748901, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0581, + "step": 2920 + }, + { + "epoch": 0.08377412437455325, + "grad_norm": 0.4567016363143921, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0633, + "step": 2930 + }, + { + "epoch": 0.08406004288777698, + "grad_norm": 0.8537796139717102, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0589, + "step": 2940 + }, + { + "epoch": 0.08434596140100072, + "grad_norm": 0.9526864886283875, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0581, + "step": 2950 + }, + { + "epoch": 0.08463187991422444, + "grad_norm": 0.8753517866134644, + "learning_rate": 1.979809151602651e-05, + "loss": 0.066, + "step": 2960 + }, + { + "epoch": 0.08491779842744818, + "grad_norm": 0.9062561988830566, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0472, + "step": 2970 + }, + { + "epoch": 0.08520371694067191, + "grad_norm": 1.0018329620361328, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0584, + "step": 2980 + }, + { + "epoch": 0.08548963545389564, + "grad_norm": 1.0577157735824585, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.05, + "step": 2990 + }, + { + "epoch": 0.08577555396711938, + "grad_norm": 1.0216799974441528, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0703, + "step": 3000 + }, + { + "epoch": 0.0860614724803431, + "grad_norm": 0.5581191778182983, + "learning_rate": 1.978769450291435e-05, + "loss": 0.0682, + "step": 3010 + }, + { + "epoch": 0.08634739099356684, + "grad_norm": 0.6187682151794434, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0575, + "step": 3020 + }, + { + "epoch": 0.08663330950679056, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.978346349055984e-05, + "loss": 0.0565, + "step": 3030 + }, + { + "epoch": 0.0869192280200143, + "grad_norm": 0.8952509760856628, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0615, + "step": 3040 + }, + { + "epoch": 0.08720514653323802, + "grad_norm": 0.7387855648994446, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.0434, + "step": 3050 + }, + { + "epoch": 0.08749106504646176, + "grad_norm": 0.8661363124847412, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0453, + "step": 3060 + }, + { + "epoch": 0.0877769835596855, + "grad_norm": 1.552089810371399, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0577, + "step": 3070 + }, + { + "epoch": 0.08806290207290922, + "grad_norm": 0.7555598616600037, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.058, + "step": 3080 + }, + { + "epoch": 0.08834882058613296, + "grad_norm": 0.7763100266456604, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.074, + "step": 3090 + }, + { + "epoch": 0.08863473909935668, + "grad_norm": 0.5088932514190674, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.07, + "step": 3100 + }, + { + "epoch": 0.08892065761258042, + "grad_norm": 0.517383873462677, + "learning_rate": 1.976612732743278e-05, + "loss": 0.0497, + "step": 3110 + }, + { + "epoch": 0.08920657612580414, + "grad_norm": 0.9673930406570435, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.061, + "step": 3120 + }, + { + "epoch": 0.08949249463902788, + "grad_norm": 1.1182832717895508, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0503, + "step": 3130 + }, + { + "epoch": 0.0897784131522516, + "grad_norm": 0.8064592480659485, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0527, + "step": 3140 + }, + { + "epoch": 0.09006433166547534, + "grad_norm": 1.3616310358047485, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0491, + "step": 3150 + }, + { + "epoch": 0.09035025017869908, + "grad_norm": 0.6205968856811523, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0492, + "step": 3160 + }, + { + "epoch": 0.0906361686919228, + "grad_norm": 0.9427729249000549, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.054, + "step": 3170 + }, + { + "epoch": 0.09092208720514654, + "grad_norm": 0.6940050721168518, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0622, + "step": 3180 + }, + { + "epoch": 0.09120800571837026, + "grad_norm": 0.7082361578941345, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0474, + "step": 3190 + }, + { + "epoch": 0.091493924231594, + "grad_norm": 0.4606474041938782, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.059, + "step": 3200 + }, + { + "epoch": 0.09177984274481772, + "grad_norm": 0.46445760130882263, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0483, + "step": 3210 + }, + { + "epoch": 0.09206576125804146, + "grad_norm": 0.7431371212005615, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.046, + "step": 3220 + }, + { + "epoch": 0.09235167977126518, + "grad_norm": 0.8430010676383972, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0665, + "step": 3230 + }, + { + "epoch": 0.09263759828448892, + "grad_norm": 0.9888875484466553, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0676, + "step": 3240 + }, + { + "epoch": 0.09292351679771266, + "grad_norm": 0.792150616645813, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0636, + "step": 3250 + }, + { + "epoch": 0.09320943531093638, + "grad_norm": 0.859030544757843, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.0634, + "step": 3260 + }, + { + "epoch": 0.09349535382416012, + "grad_norm": 0.7612795233726501, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0658, + "step": 3270 + }, + { + "epoch": 0.09378127233738384, + "grad_norm": 0.5470104217529297, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0514, + "step": 3280 + }, + { + "epoch": 0.09406719085060758, + "grad_norm": 0.6354894042015076, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.0489, + "step": 3290 + }, + { + "epoch": 0.0943531093638313, + "grad_norm": 1.3852356672286987, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0694, + "step": 3300 + }, + { + "epoch": 0.09463902787705504, + "grad_norm": 0.5610274076461792, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0629, + "step": 3310 + }, + { + "epoch": 0.09492494639027876, + "grad_norm": 1.2192410230636597, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0525, + "step": 3320 + }, + { + "epoch": 0.0952108649035025, + "grad_norm": 1.06831955909729, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.051, + "step": 3330 + }, + { + "epoch": 0.09549678341672624, + "grad_norm": 0.32288479804992676, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0503, + "step": 3340 + }, + { + "epoch": 0.09578270192994996, + "grad_norm": 0.5871645212173462, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0564, + "step": 3350 + }, + { + "epoch": 0.0960686204431737, + "grad_norm": 0.6069591045379639, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0495, + "step": 3360 + }, + { + "epoch": 0.09635453895639742, + "grad_norm": 1.0015379190444946, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0624, + "step": 3370 + }, + { + "epoch": 0.09664045746962116, + "grad_norm": 0.7534980773925781, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.0618, + "step": 3380 + }, + { + "epoch": 0.09692637598284488, + "grad_norm": 0.45888280868530273, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0445, + "step": 3390 + }, + { + "epoch": 0.09721229449606862, + "grad_norm": 0.7550806403160095, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0461, + "step": 3400 + }, + { + "epoch": 0.09749821300929234, + "grad_norm": 0.4738181531429291, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.052, + "step": 3410 + }, + { + "epoch": 0.09778413152251608, + "grad_norm": 0.6711190938949585, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0476, + "step": 3420 + }, + { + "epoch": 0.09807005003573982, + "grad_norm": 0.4751316010951996, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0507, + "step": 3430 + }, + { + "epoch": 0.09835596854896354, + "grad_norm": 0.83565753698349, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0492, + "step": 3440 + }, + { + "epoch": 0.09864188706218728, + "grad_norm": 0.5360665321350098, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0627, + "step": 3450 + }, + { + "epoch": 0.098927805575411, + "grad_norm": 0.7463604211807251, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0496, + "step": 3460 + }, + { + "epoch": 0.09921372408863474, + "grad_norm": 0.7294344305992126, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0697, + "step": 3470 + }, + { + "epoch": 0.09949964260185847, + "grad_norm": 0.5676283836364746, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.0541, + "step": 3480 + }, + { + "epoch": 0.0997855611150822, + "grad_norm": 0.5879732370376587, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.0442, + "step": 3490 + }, + { + "epoch": 0.10007147962830593, + "grad_norm": 0.832818865776062, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0505, + "step": 3500 + }, + { + "epoch": 0.10035739814152966, + "grad_norm": 0.48553410172462463, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0573, + "step": 3510 + }, + { + "epoch": 0.1006433166547534, + "grad_norm": 0.9502766728401184, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0449, + "step": 3520 + }, + { + "epoch": 0.10092923516797712, + "grad_norm": 0.7497885227203369, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0737, + "step": 3530 + }, + { + "epoch": 0.10121515368120086, + "grad_norm": 0.5581928491592407, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0514, + "step": 3540 + }, + { + "epoch": 0.10150107219442459, + "grad_norm": 1.140236258506775, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0524, + "step": 3550 + }, + { + "epoch": 0.10178699070764832, + "grad_norm": 0.8161870241165161, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0721, + "step": 3560 + }, + { + "epoch": 0.10207290922087205, + "grad_norm": 0.8796533942222595, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0564, + "step": 3570 + }, + { + "epoch": 0.10235882773409578, + "grad_norm": 1.4811128377914429, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.063, + "step": 3580 + }, + { + "epoch": 0.10264474624731951, + "grad_norm": 0.8029062747955322, + "learning_rate": 1.964833301001045e-05, + "loss": 0.0589, + "step": 3590 + }, + { + "epoch": 0.10293066476054324, + "grad_norm": 0.7806634902954102, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0617, + "step": 3600 + }, + { + "epoch": 0.10321658327376698, + "grad_norm": 1.1286838054656982, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0574, + "step": 3610 + }, + { + "epoch": 0.1035025017869907, + "grad_norm": 0.374104768037796, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.047, + "step": 3620 + }, + { + "epoch": 0.10378842030021444, + "grad_norm": 1.1743136644363403, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0676, + "step": 3630 + }, + { + "epoch": 0.10407433881343817, + "grad_norm": 0.7684413194656372, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0663, + "step": 3640 + }, + { + "epoch": 0.1043602573266619, + "grad_norm": 1.0642409324645996, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.051, + "step": 3650 + }, + { + "epoch": 0.10464617583988563, + "grad_norm": 0.7752460837364197, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0543, + "step": 3660 + }, + { + "epoch": 0.10493209435310936, + "grad_norm": 0.9053257703781128, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.051, + "step": 3670 + }, + { + "epoch": 0.10521801286633309, + "grad_norm": 0.7407983541488647, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0436, + "step": 3680 + }, + { + "epoch": 0.10550393137955683, + "grad_norm": 1.3622519969940186, + "learning_rate": 1.962083815106258e-05, + "loss": 0.0468, + "step": 3690 + }, + { + "epoch": 0.10578984989278056, + "grad_norm": 1.2751463651657104, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0571, + "step": 3700 + }, + { + "epoch": 0.10607576840600429, + "grad_norm": 0.5535411238670349, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0564, + "step": 3710 + }, + { + "epoch": 0.10636168691922802, + "grad_norm": 0.6728671193122864, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0487, + "step": 3720 + }, + { + "epoch": 0.10664760543245175, + "grad_norm": 0.82345050573349, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0656, + "step": 3730 + }, + { + "epoch": 0.10693352394567548, + "grad_norm": 0.6446594595909119, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0467, + "step": 3740 + }, + { + "epoch": 0.10721944245889921, + "grad_norm": 1.0836280584335327, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0536, + "step": 3750 + }, + { + "epoch": 0.10750536097212295, + "grad_norm": 0.3758300840854645, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0476, + "step": 3760 + }, + { + "epoch": 0.10779127948534667, + "grad_norm": 0.682266116142273, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0519, + "step": 3770 + }, + { + "epoch": 0.1080771979985704, + "grad_norm": 0.5025804042816162, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0474, + "step": 3780 + }, + { + "epoch": 0.10836311651179414, + "grad_norm": 1.019890308380127, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0492, + "step": 3790 + }, + { + "epoch": 0.10864903502501787, + "grad_norm": 0.7843710780143738, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0561, + "step": 3800 + }, + { + "epoch": 0.1089349535382416, + "grad_norm": 0.5028522610664368, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0582, + "step": 3810 + }, + { + "epoch": 0.10922087205146533, + "grad_norm": 0.6400144696235657, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0502, + "step": 3820 + }, + { + "epoch": 0.10950679056468907, + "grad_norm": 0.9432899355888367, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0471, + "step": 3830 + }, + { + "epoch": 0.10979270907791279, + "grad_norm": 0.7582482695579529, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.052, + "step": 3840 + }, + { + "epoch": 0.11007862759113653, + "grad_norm": 0.34035608172416687, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0464, + "step": 3850 + }, + { + "epoch": 0.11036454610436025, + "grad_norm": 1.3330878019332886, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0578, + "step": 3860 + }, + { + "epoch": 0.11065046461758399, + "grad_norm": 0.7309219837188721, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.052, + "step": 3870 + }, + { + "epoch": 0.11093638313080773, + "grad_norm": 0.6248922944068909, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0815, + "step": 3880 + }, + { + "epoch": 0.11122230164403145, + "grad_norm": 0.8298835158348083, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0491, + "step": 3890 + }, + { + "epoch": 0.11150822015725519, + "grad_norm": 0.6728928685188293, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.0506, + "step": 3900 + }, + { + "epoch": 0.11179413867047891, + "grad_norm": 0.8456764817237854, + "learning_rate": 1.95567930185928e-05, + "loss": 0.051, + "step": 3910 + }, + { + "epoch": 0.11208005718370265, + "grad_norm": 0.9024212956428528, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0514, + "step": 3920 + }, + { + "epoch": 0.11236597569692637, + "grad_norm": 0.4843275845050812, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.056, + "step": 3930 + }, + { + "epoch": 0.11265189421015011, + "grad_norm": 0.5677530765533447, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0548, + "step": 3940 + }, + { + "epoch": 0.11293781272337383, + "grad_norm": 1.0913296937942505, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0697, + "step": 3950 + }, + { + "epoch": 0.11322373123659757, + "grad_norm": 0.6271129250526428, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0655, + "step": 3960 + }, + { + "epoch": 0.1135096497498213, + "grad_norm": 0.9063813090324402, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0469, + "step": 3970 + }, + { + "epoch": 0.11379556826304503, + "grad_norm": 0.7493836283683777, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0482, + "step": 3980 + }, + { + "epoch": 0.11408148677626877, + "grad_norm": 0.8022870421409607, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0655, + "step": 3990 + }, + { + "epoch": 0.11436740528949249, + "grad_norm": 0.6266750693321228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0542, + "step": 4000 + }, + { + "epoch": 0.11465332380271623, + "grad_norm": 0.45027732849121094, + "learning_rate": 1.95260726824789e-05, + "loss": 0.058, + "step": 4010 + }, + { + "epoch": 0.11493924231593995, + "grad_norm": 0.950760543346405, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.0552, + "step": 4020 + }, + { + "epoch": 0.11522516082916369, + "grad_norm": 0.6397078037261963, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.0544, + "step": 4030 + }, + { + "epoch": 0.11551107934238741, + "grad_norm": 0.7060579657554626, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0565, + "step": 4040 + }, + { + "epoch": 0.11579699785561115, + "grad_norm": 0.7861781716346741, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0511, + "step": 4050 + }, + { + "epoch": 0.11608291636883489, + "grad_norm": 0.5479229688644409, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0509, + "step": 4060 + }, + { + "epoch": 0.11636883488205861, + "grad_norm": 0.3854960501194, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0623, + "step": 4070 + }, + { + "epoch": 0.11665475339528235, + "grad_norm": 1.9533435106277466, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0643, + "step": 4080 + }, + { + "epoch": 0.11694067190850607, + "grad_norm": 0.5853668451309204, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0453, + "step": 4090 + }, + { + "epoch": 0.11722659042172981, + "grad_norm": 0.6850668787956238, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0555, + "step": 4100 + }, + { + "epoch": 0.11751250893495353, + "grad_norm": 1.1605839729309082, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0586, + "step": 4110 + }, + { + "epoch": 0.11779842744817727, + "grad_norm": 0.7753151059150696, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0623, + "step": 4120 + }, + { + "epoch": 0.118084345961401, + "grad_norm": 0.7955726385116577, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0467, + "step": 4130 + }, + { + "epoch": 0.11837026447462473, + "grad_norm": 0.7632233500480652, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0523, + "step": 4140 + }, + { + "epoch": 0.11865618298784847, + "grad_norm": 0.5821241140365601, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0485, + "step": 4150 + }, + { + "epoch": 0.11894210150107219, + "grad_norm": 0.4795539379119873, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0559, + "step": 4160 + }, + { + "epoch": 0.11922802001429593, + "grad_norm": 0.6324377655982971, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0473, + "step": 4170 + }, + { + "epoch": 0.11951393852751965, + "grad_norm": 0.8578745722770691, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.0474, + "step": 4180 + }, + { + "epoch": 0.11979985704074339, + "grad_norm": 0.5988736748695374, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 0.12008577555396711, + "grad_norm": 0.8098701238632202, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0511, + "step": 4200 + }, + { + "epoch": 0.12037169406719085, + "grad_norm": 1.2059956789016724, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0501, + "step": 4210 + }, + { + "epoch": 0.12065761258041457, + "grad_norm": 0.7477571368217468, + "learning_rate": 1.945830755977688e-05, + "loss": 0.0565, + "step": 4220 + }, + { + "epoch": 0.12094353109363831, + "grad_norm": 0.467942476272583, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0601, + "step": 4230 + }, + { + "epoch": 0.12122944960686205, + "grad_norm": 0.5761682391166687, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.052, + "step": 4240 + }, + { + "epoch": 0.12151536812008577, + "grad_norm": 0.8247032761573792, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0503, + "step": 4250 + }, + { + "epoch": 0.12180128663330951, + "grad_norm": 0.5218040347099304, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0434, + "step": 4260 + }, + { + "epoch": 0.12208720514653323, + "grad_norm": 0.5024936199188232, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0474, + "step": 4270 + }, + { + "epoch": 0.12237312365975697, + "grad_norm": 0.5558021664619446, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0493, + "step": 4280 + }, + { + "epoch": 0.1226590421729807, + "grad_norm": 0.6252139210700989, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0467, + "step": 4290 + }, + { + "epoch": 0.12294496068620443, + "grad_norm": 0.6613588929176331, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0572, + "step": 4300 + }, + { + "epoch": 0.12323087919942816, + "grad_norm": 0.8098927736282349, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0551, + "step": 4310 + }, + { + "epoch": 0.1235167977126519, + "grad_norm": 0.8598331809043884, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0517, + "step": 4320 + }, + { + "epoch": 0.12380271622587563, + "grad_norm": 1.2555822134017944, + "learning_rate": 1.942106227801521e-05, + "loss": 0.0499, + "step": 4330 + }, + { + "epoch": 0.12408863473909935, + "grad_norm": 0.5311633348464966, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0467, + "step": 4340 + }, + { + "epoch": 0.12437455325232309, + "grad_norm": 0.5674521327018738, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0564, + "step": 4350 + }, + { + "epoch": 0.12466047176554682, + "grad_norm": 0.5226582884788513, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0503, + "step": 4360 + }, + { + "epoch": 0.12494639027877055, + "grad_norm": 0.8510275483131409, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0626, + "step": 4370 + }, + { + "epoch": 0.1252323087919943, + "grad_norm": 1.6184005737304688, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0565, + "step": 4380 + }, + { + "epoch": 0.125518227305218, + "grad_norm": 0.7836401462554932, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0567, + "step": 4390 + }, + { + "epoch": 0.12580414581844174, + "grad_norm": 0.686989963054657, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0727, + "step": 4400 + }, + { + "epoch": 0.12609006433166547, + "grad_norm": 0.6000984907150269, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0459, + "step": 4410 + }, + { + "epoch": 0.1263759828448892, + "grad_norm": 0.8751336932182312, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0509, + "step": 4420 + }, + { + "epoch": 0.12666190135811295, + "grad_norm": 0.9281551837921143, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0536, + "step": 4430 + }, + { + "epoch": 0.12694781987133666, + "grad_norm": 0.5268979668617249, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0487, + "step": 4440 + }, + { + "epoch": 0.1272337383845604, + "grad_norm": 0.9246962070465088, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0525, + "step": 4450 + }, + { + "epoch": 0.12751965689778413, + "grad_norm": 1.2159569263458252, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0559, + "step": 4460 + }, + { + "epoch": 0.12780557541100787, + "grad_norm": 1.1705470085144043, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0473, + "step": 4470 + }, + { + "epoch": 0.12809149392423158, + "grad_norm": 0.4624033570289612, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0385, + "step": 4480 + }, + { + "epoch": 0.12837741243745532, + "grad_norm": 0.68497633934021, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.055, + "step": 4490 + }, + { + "epoch": 0.12866333095067906, + "grad_norm": 0.6132450699806213, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0512, + "step": 4500 + }, + { + "epoch": 0.1289492494639028, + "grad_norm": 0.7438398003578186, + "learning_rate": 1.935753861926916e-05, + "loss": 0.057, + "step": 4510 + }, + { + "epoch": 0.12923516797712653, + "grad_norm": 1.01064133644104, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0542, + "step": 4520 + }, + { + "epoch": 0.12952108649035024, + "grad_norm": 0.7620115280151367, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0511, + "step": 4530 + }, + { + "epoch": 0.12980700500357398, + "grad_norm": 0.8325042128562927, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0434, + "step": 4540 + }, + { + "epoch": 0.13009292351679771, + "grad_norm": 1.333525538444519, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0527, + "step": 4550 + }, + { + "epoch": 0.13037884203002145, + "grad_norm": 0.5498093962669373, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0455, + "step": 4560 + }, + { + "epoch": 0.13066476054324516, + "grad_norm": 0.8072503209114075, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0471, + "step": 4570 + }, + { + "epoch": 0.1309506790564689, + "grad_norm": 0.7596970200538635, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0476, + "step": 4580 + }, + { + "epoch": 0.13123659756969264, + "grad_norm": 0.5895066857337952, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.058, + "step": 4590 + }, + { + "epoch": 0.13152251608291637, + "grad_norm": 0.7977209687232971, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0453, + "step": 4600 + }, + { + "epoch": 0.1318084345961401, + "grad_norm": 0.6070771813392639, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0626, + "step": 4610 + }, + { + "epoch": 0.13209435310936382, + "grad_norm": 0.776318371295929, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.0502, + "step": 4620 + }, + { + "epoch": 0.13238027162258756, + "grad_norm": 0.7913787961006165, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0495, + "step": 4630 + }, + { + "epoch": 0.1326661901358113, + "grad_norm": 0.7327920794487, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0537, + "step": 4640 + }, + { + "epoch": 0.13295210864903503, + "grad_norm": 1.2004122734069824, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.0479, + "step": 4650 + }, + { + "epoch": 0.13323802716225874, + "grad_norm": 0.663301408290863, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0426, + "step": 4660 + }, + { + "epoch": 0.13352394567548248, + "grad_norm": 0.7744486331939697, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0538, + "step": 4670 + }, + { + "epoch": 0.13380986418870622, + "grad_norm": 0.6179795265197754, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0562, + "step": 4680 + }, + { + "epoch": 0.13409578270192996, + "grad_norm": 0.6461634635925293, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0458, + "step": 4690 + }, + { + "epoch": 0.1343817012151537, + "grad_norm": 0.6578474640846252, + "learning_rate": 1.928703895604588e-05, + "loss": 0.064, + "step": 4700 + }, + { + "epoch": 0.1346676197283774, + "grad_norm": 0.8851020336151123, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0632, + "step": 4710 + }, + { + "epoch": 0.13495353824160114, + "grad_norm": 0.4704781472682953, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0431, + "step": 4720 + }, + { + "epoch": 0.13523945675482488, + "grad_norm": 0.9809741377830505, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.059, + "step": 4730 + }, + { + "epoch": 0.13552537526804861, + "grad_norm": 0.9307458400726318, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0528, + "step": 4740 + }, + { + "epoch": 0.13581129378127232, + "grad_norm": 0.8084405660629272, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0603, + "step": 4750 + }, + { + "epoch": 0.13609721229449606, + "grad_norm": 0.6919799447059631, + "learning_rate": 1.926404507646751e-05, + "loss": 0.0589, + "step": 4760 + }, + { + "epoch": 0.1363831308077198, + "grad_norm": 0.8543849587440491, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0508, + "step": 4770 + }, + { + "epoch": 0.13666904932094354, + "grad_norm": 0.6308473348617554, + "learning_rate": 1.925630281527157e-05, + "loss": 0.0485, + "step": 4780 + }, + { + "epoch": 0.13695496783416727, + "grad_norm": 0.739931046962738, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0534, + "step": 4790 + }, + { + "epoch": 0.13724088634739098, + "grad_norm": 0.7895604372024536, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0449, + "step": 4800 + }, + { + "epoch": 0.13752680486061472, + "grad_norm": 0.7639005780220032, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0496, + "step": 4810 + }, + { + "epoch": 0.13781272337383846, + "grad_norm": 0.5999978184700012, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.052, + "step": 4820 + }, + { + "epoch": 0.1380986418870622, + "grad_norm": 0.8037213087081909, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0667, + "step": 4830 + }, + { + "epoch": 0.1383845604002859, + "grad_norm": 0.7414689064025879, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.0509, + "step": 4840 + }, + { + "epoch": 0.13867047891350964, + "grad_norm": 0.6627739667892456, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0515, + "step": 4850 + }, + { + "epoch": 0.13895639742673338, + "grad_norm": 0.6969587802886963, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0626, + "step": 4860 + }, + { + "epoch": 0.13924231593995712, + "grad_norm": 0.7554855942726135, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0478, + "step": 4870 + }, + { + "epoch": 0.13952823445318085, + "grad_norm": 0.5623564124107361, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.044, + "step": 4880 + }, + { + "epoch": 0.13981415296640456, + "grad_norm": 0.6897832751274109, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0466, + "step": 4890 + }, + { + "epoch": 0.1401000714796283, + "grad_norm": 0.5474520921707153, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0514, + "step": 4900 + }, + { + "epoch": 0.14038598999285204, + "grad_norm": 0.9736361503601074, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0472, + "step": 4910 + }, + { + "epoch": 0.14067190850607578, + "grad_norm": 0.5566041469573975, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0533, + "step": 4920 + }, + { + "epoch": 0.1409578270192995, + "grad_norm": 1.0295166969299316, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.0478, + "step": 4930 + }, + { + "epoch": 0.14124374553252322, + "grad_norm": 1.0931389331817627, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0652, + "step": 4940 + }, + { + "epoch": 0.14152966404574696, + "grad_norm": 1.3054399490356445, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0564, + "step": 4950 + }, + { + "epoch": 0.1418155825589707, + "grad_norm": 0.45592883229255676, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0456, + "step": 4960 + }, + { + "epoch": 0.14210150107219444, + "grad_norm": 0.6758268475532532, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0543, + "step": 4970 + }, + { + "epoch": 0.14238741958541815, + "grad_norm": 0.9643615484237671, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0532, + "step": 4980 + }, + { + "epoch": 0.14267333809864188, + "grad_norm": 0.565969705581665, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0458, + "step": 4990 + }, + { + "epoch": 0.14295925661186562, + "grad_norm": 0.8053064346313477, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0558, + "step": 5000 + }, + { + "epoch": 0.14324517512508936, + "grad_norm": 0.606215238571167, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0562, + "step": 5010 + }, + { + "epoch": 0.14353109363831307, + "grad_norm": 0.5565656423568726, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0543, + "step": 5020 + }, + { + "epoch": 0.1438170121515368, + "grad_norm": 0.353696346282959, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0451, + "step": 5030 + }, + { + "epoch": 0.14410293066476054, + "grad_norm": 0.6627641916275024, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.0607, + "step": 5040 + }, + { + "epoch": 0.14438884917798428, + "grad_norm": 0.7896742224693298, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0496, + "step": 5050 + }, + { + "epoch": 0.14467476769120802, + "grad_norm": 0.7444631457328796, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0641, + "step": 5060 + }, + { + "epoch": 0.14496068620443173, + "grad_norm": 0.7871376872062683, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0464, + "step": 5070 + }, + { + "epoch": 0.14524660471765546, + "grad_norm": 0.7784642577171326, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0466, + "step": 5080 + }, + { + "epoch": 0.1455325232308792, + "grad_norm": 0.6950685381889343, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.0457, + "step": 5090 + }, + { + "epoch": 0.14581844174410294, + "grad_norm": 1.0631619691848755, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0513, + "step": 5100 + }, + { + "epoch": 0.14610436025732665, + "grad_norm": 0.4327051639556885, + "learning_rate": 1.912298771234382e-05, + "loss": 0.0599, + "step": 5110 + }, + { + "epoch": 0.14639027877055039, + "grad_norm": 0.7790032029151917, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0617, + "step": 5120 + }, + { + "epoch": 0.14667619728377412, + "grad_norm": 0.42061591148376465, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.14696211579699786, + "grad_norm": 1.4090712070465088, + "learning_rate": 1.911035077753307e-05, + "loss": 0.0564, + "step": 5140 + }, + { + "epoch": 0.1472480343102216, + "grad_norm": 0.540844738483429, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0505, + "step": 5150 + }, + { + "epoch": 0.1475339528234453, + "grad_norm": 0.5608566999435425, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0583, + "step": 5160 + }, + { + "epoch": 0.14781987133666905, + "grad_norm": 0.750708818435669, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.0467, + "step": 5170 + }, + { + "epoch": 0.14810578984989278, + "grad_norm": 0.608989953994751, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0507, + "step": 5180 + }, + { + "epoch": 0.14839170836311652, + "grad_norm": 0.8176707029342651, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0455, + "step": 5190 + }, + { + "epoch": 0.14867762687634023, + "grad_norm": 0.5280511379241943, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0586, + "step": 5200 + }, + { + "epoch": 0.14896354538956397, + "grad_norm": 0.5914652347564697, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.042, + "step": 5210 + }, + { + "epoch": 0.1492494639027877, + "grad_norm": 0.4816238582134247, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0431, + "step": 5220 + }, + { + "epoch": 0.14953538241601144, + "grad_norm": 0.5413132309913635, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0453, + "step": 5230 + }, + { + "epoch": 0.14982130092923518, + "grad_norm": 0.749200701713562, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0505, + "step": 5240 + }, + { + "epoch": 0.1501072194424589, + "grad_norm": 0.8051598072052002, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0513, + "step": 5250 + }, + { + "epoch": 0.15039313795568263, + "grad_norm": 0.5365609526634216, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0383, + "step": 5260 + }, + { + "epoch": 0.15067905646890636, + "grad_norm": 0.5546812415122986, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0438, + "step": 5270 + }, + { + "epoch": 0.1509649749821301, + "grad_norm": 0.6248345375061035, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.045, + "step": 5280 + }, + { + "epoch": 0.1512508934953538, + "grad_norm": 0.42673179507255554, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0435, + "step": 5290 + }, + { + "epoch": 0.15153681200857755, + "grad_norm": 0.6677115559577942, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0541, + "step": 5300 + }, + { + "epoch": 0.15182273052180129, + "grad_norm": 0.4739227294921875, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0516, + "step": 5310 + }, + { + "epoch": 0.15210864903502502, + "grad_norm": 0.7931821346282959, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0566, + "step": 5320 + }, + { + "epoch": 0.15239456754824876, + "grad_norm": 0.6296460032463074, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0496, + "step": 5330 + }, + { + "epoch": 0.15268048606147247, + "grad_norm": 0.6713911890983582, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0462, + "step": 5340 + }, + { + "epoch": 0.1529664045746962, + "grad_norm": 1.088040828704834, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0663, + "step": 5350 + }, + { + "epoch": 0.15325232308791994, + "grad_norm": 1.4942265748977661, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0541, + "step": 5360 + }, + { + "epoch": 0.15353824160114368, + "grad_norm": 1.5721286535263062, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.0546, + "step": 5370 + }, + { + "epoch": 0.1538241601143674, + "grad_norm": 0.9329798221588135, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0538, + "step": 5380 + }, + { + "epoch": 0.15411007862759113, + "grad_norm": 0.5658103823661804, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0561, + "step": 5390 + }, + { + "epoch": 0.15439599714081487, + "grad_norm": 0.6210218071937561, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.054, + "step": 5400 + }, + { + "epoch": 0.1546819156540386, + "grad_norm": 0.7934702634811401, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0506, + "step": 5410 + }, + { + "epoch": 0.15496783416726234, + "grad_norm": 1.0321810245513916, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0483, + "step": 5420 + }, + { + "epoch": 0.15525375268048605, + "grad_norm": 0.6226248145103455, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0464, + "step": 5430 + }, + { + "epoch": 0.1555396711937098, + "grad_norm": 0.6217877864837646, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0466, + "step": 5440 + }, + { + "epoch": 0.15582558970693353, + "grad_norm": 0.44068101048469543, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0517, + "step": 5450 + }, + { + "epoch": 0.15611150822015726, + "grad_norm": 0.4715922772884369, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0391, + "step": 5460 + }, + { + "epoch": 0.15639742673338097, + "grad_norm": 0.6649858951568604, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0524, + "step": 5470 + }, + { + "epoch": 0.1566833452466047, + "grad_norm": 0.5635918974876404, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.054, + "step": 5480 + }, + { + "epoch": 0.15696926375982845, + "grad_norm": 0.5584990978240967, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0559, + "step": 5490 + }, + { + "epoch": 0.15725518227305219, + "grad_norm": 0.7777124047279358, + "learning_rate": 1.895206504082557e-05, + "loss": 0.052, + "step": 5500 + }, + { + "epoch": 0.15754110078627592, + "grad_norm": 0.7057285308837891, + "learning_rate": 1.894749443411004e-05, + "loss": 0.0507, + "step": 5510 + }, + { + "epoch": 0.15782701929949963, + "grad_norm": 0.4290146827697754, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0508, + "step": 5520 + }, + { + "epoch": 0.15811293781272337, + "grad_norm": 0.7333746552467346, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0609, + "step": 5530 + }, + { + "epoch": 0.1583988563259471, + "grad_norm": 0.6905514001846313, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.0441, + "step": 5540 + }, + { + "epoch": 0.15868477483917084, + "grad_norm": 0.4859441816806793, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0586, + "step": 5550 + }, + { + "epoch": 0.15897069335239455, + "grad_norm": 0.4259501099586487, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0446, + "step": 5560 + }, + { + "epoch": 0.1592566118656183, + "grad_norm": 0.7659216523170471, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0486, + "step": 5570 + }, + { + "epoch": 0.15954253037884203, + "grad_norm": 0.6377918124198914, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0497, + "step": 5580 + }, + { + "epoch": 0.15982844889206577, + "grad_norm": 0.9122095704078674, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0497, + "step": 5590 + }, + { + "epoch": 0.1601143674052895, + "grad_norm": 0.5986319780349731, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0789, + "step": 5600 + }, + { + "epoch": 0.1604002859185132, + "grad_norm": 0.6486982107162476, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0481, + "step": 5610 + }, + { + "epoch": 0.16068620443173695, + "grad_norm": 0.9778286814689636, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0609, + "step": 5620 + }, + { + "epoch": 0.1609721229449607, + "grad_norm": 0.9133608341217041, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0499, + "step": 5630 + }, + { + "epoch": 0.16125804145818443, + "grad_norm": 0.8979085087776184, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0539, + "step": 5640 + }, + { + "epoch": 0.16154395997140814, + "grad_norm": 0.7787102460861206, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0593, + "step": 5650 + }, + { + "epoch": 0.16182987848463187, + "grad_norm": 0.8269296884536743, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0466, + "step": 5660 + }, + { + "epoch": 0.1621157969978556, + "grad_norm": 1.0018537044525146, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0542, + "step": 5670 + }, + { + "epoch": 0.16240171551107935, + "grad_norm": 0.6690066456794739, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0504, + "step": 5680 + }, + { + "epoch": 0.16268763402430308, + "grad_norm": 0.8186119198799133, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0523, + "step": 5690 + }, + { + "epoch": 0.1629735525375268, + "grad_norm": 0.6039218902587891, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.053, + "step": 5700 + }, + { + "epoch": 0.16325947105075053, + "grad_norm": 0.5570294857025146, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0627, + "step": 5710 + }, + { + "epoch": 0.16354538956397427, + "grad_norm": 0.6330029368400574, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.043, + "step": 5720 + }, + { + "epoch": 0.163831308077198, + "grad_norm": 0.42857953906059265, + "learning_rate": 1.884459101447439e-05, + "loss": 0.043, + "step": 5730 + }, + { + "epoch": 0.16411722659042172, + "grad_norm": 0.6611765027046204, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0478, + "step": 5740 + }, + { + "epoch": 0.16440314510364545, + "grad_norm": 0.5025321841239929, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0563, + "step": 5750 + }, + { + "epoch": 0.1646890636168692, + "grad_norm": 0.468772292137146, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0579, + "step": 5760 + }, + { + "epoch": 0.16497498213009293, + "grad_norm": 0.8914149403572083, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0451, + "step": 5770 + }, + { + "epoch": 0.16526090064331667, + "grad_norm": 0.7421362996101379, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0446, + "step": 5780 + }, + { + "epoch": 0.16554681915654038, + "grad_norm": 0.6159907579421997, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0486, + "step": 5790 + }, + { + "epoch": 0.1658327376697641, + "grad_norm": 0.7762402892112732, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0528, + "step": 5800 + }, + { + "epoch": 0.16611865618298785, + "grad_norm": 0.688562273979187, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0562, + "step": 5810 + }, + { + "epoch": 0.1664045746962116, + "grad_norm": 0.6233720183372498, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0454, + "step": 5820 + }, + { + "epoch": 0.1666904932094353, + "grad_norm": 1.0762931108474731, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0586, + "step": 5830 + }, + { + "epoch": 0.16697641172265903, + "grad_norm": 0.6782101988792419, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0486, + "step": 5840 + }, + { + "epoch": 0.16726233023588277, + "grad_norm": 0.8854986429214478, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0668, + "step": 5850 + }, + { + "epoch": 0.1675482487491065, + "grad_norm": 0.6537308096885681, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0456, + "step": 5860 + }, + { + "epoch": 0.16783416726233025, + "grad_norm": 1.4588080644607544, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0531, + "step": 5870 + }, + { + "epoch": 0.16812008577555396, + "grad_norm": 0.4888838529586792, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0608, + "step": 5880 + }, + { + "epoch": 0.1684060042887777, + "grad_norm": 0.6046859622001648, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.0596, + "step": 5890 + }, + { + "epoch": 0.16869192280200143, + "grad_norm": 1.0373053550720215, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0672, + "step": 5900 + }, + { + "epoch": 0.16897784131522517, + "grad_norm": 0.7728743553161621, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0567, + "step": 5910 + }, + { + "epoch": 0.16926375982844888, + "grad_norm": 0.7804396152496338, + "learning_rate": 1.875213208215953e-05, + "loss": 0.0443, + "step": 5920 + }, + { + "epoch": 0.16954967834167262, + "grad_norm": 0.5331568717956543, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0462, + "step": 5930 + }, + { + "epoch": 0.16983559685489635, + "grad_norm": 0.5623118877410889, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0549, + "step": 5940 + }, + { + "epoch": 0.1701215153681201, + "grad_norm": 0.5113009214401245, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0571, + "step": 5950 + }, + { + "epoch": 0.17040743388134383, + "grad_norm": 0.45996031165122986, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.0474, + "step": 5960 + }, + { + "epoch": 0.17069335239456754, + "grad_norm": 0.9673702716827393, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0496, + "step": 5970 + }, + { + "epoch": 0.17097927090779128, + "grad_norm": 0.6134442687034607, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0472, + "step": 5980 + }, + { + "epoch": 0.171265189421015, + "grad_norm": 0.5929660797119141, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0448, + "step": 5990 + }, + { + "epoch": 0.17155110793423875, + "grad_norm": 0.6973591446876526, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0561, + "step": 6000 + }, + { + "epoch": 0.17183702644746246, + "grad_norm": 0.6361686587333679, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0528, + "step": 6010 + }, + { + "epoch": 0.1721229449606862, + "grad_norm": 0.8463344573974609, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0505, + "step": 6020 + }, + { + "epoch": 0.17240886347390993, + "grad_norm": 0.7931243777275085, + "learning_rate": 1.869709961183946e-05, + "loss": 0.047, + "step": 6030 + }, + { + "epoch": 0.17269478198713367, + "grad_norm": 0.8827673196792603, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0449, + "step": 6040 + }, + { + "epoch": 0.1729807005003574, + "grad_norm": 0.624167263507843, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0432, + "step": 6050 + }, + { + "epoch": 0.17326661901358112, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0546, + "step": 6060 + }, + { + "epoch": 0.17355253752680486, + "grad_norm": 0.6836652755737305, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0463, + "step": 6070 + }, + { + "epoch": 0.1738384560400286, + "grad_norm": 0.5454772114753723, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0554, + "step": 6080 + }, + { + "epoch": 0.17412437455325233, + "grad_norm": 0.3758164048194885, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0437, + "step": 6090 + }, + { + "epoch": 0.17441029306647604, + "grad_norm": 0.4269026517868042, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0436, + "step": 6100 + }, + { + "epoch": 0.17469621157969978, + "grad_norm": 1.3504232168197632, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0563, + "step": 6110 + }, + { + "epoch": 0.17498213009292352, + "grad_norm": 0.6270191669464111, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0552, + "step": 6120 + }, + { + "epoch": 0.17526804860614725, + "grad_norm": 0.7632624506950378, + "learning_rate": 1.864612143364565e-05, + "loss": 0.042, + "step": 6130 + }, + { + "epoch": 0.175553967119371, + "grad_norm": 0.7420883774757385, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0472, + "step": 6140 + }, + { + "epoch": 0.1758398856325947, + "grad_norm": 0.38518550992012024, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0494, + "step": 6150 + }, + { + "epoch": 0.17612580414581844, + "grad_norm": 0.4203122556209564, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.049, + "step": 6160 + }, + { + "epoch": 0.17641172265904217, + "grad_norm": 0.843169093132019, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0528, + "step": 6170 + }, + { + "epoch": 0.1766976411722659, + "grad_norm": 0.5033279657363892, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0441, + "step": 6180 + }, + { + "epoch": 0.17698355968548962, + "grad_norm": 0.9894040822982788, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0494, + "step": 6190 + }, + { + "epoch": 0.17726947819871336, + "grad_norm": 0.8269744515419006, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.0533, + "step": 6200 + }, + { + "epoch": 0.1775553967119371, + "grad_norm": 0.7923200726509094, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0518, + "step": 6210 + }, + { + "epoch": 0.17784131522516083, + "grad_norm": 0.580436646938324, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0445, + "step": 6220 + }, + { + "epoch": 0.17812723373838457, + "grad_norm": 1.0633399486541748, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0528, + "step": 6230 + }, + { + "epoch": 0.17841315225160828, + "grad_norm": 0.925599217414856, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0552, + "step": 6240 + }, + { + "epoch": 0.17869907076483202, + "grad_norm": 0.5874597430229187, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0473, + "step": 6250 + }, + { + "epoch": 0.17898498927805576, + "grad_norm": 0.9065818190574646, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0504, + "step": 6260 + }, + { + "epoch": 0.1792709077912795, + "grad_norm": 0.9060930609703064, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0578, + "step": 6270 + }, + { + "epoch": 0.1795568263045032, + "grad_norm": 0.6221855878829956, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0425, + "step": 6280 + }, + { + "epoch": 0.17984274481772694, + "grad_norm": 0.589621901512146, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0452, + "step": 6290 + }, + { + "epoch": 0.18012866333095068, + "grad_norm": 0.4308580756187439, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0528, + "step": 6300 + }, + { + "epoch": 0.18041458184417442, + "grad_norm": 0.34031248092651367, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0544, + "step": 6310 + }, + { + "epoch": 0.18070050035739815, + "grad_norm": 0.6438931226730347, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0411, + "step": 6320 + }, + { + "epoch": 0.18098641887062186, + "grad_norm": 0.5436957478523254, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0381, + "step": 6330 + }, + { + "epoch": 0.1812723373838456, + "grad_norm": 0.7326043248176575, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0486, + "step": 6340 + }, + { + "epoch": 0.18155825589706934, + "grad_norm": 0.9194608330726624, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0455, + "step": 6350 + }, + { + "epoch": 0.18184417441029307, + "grad_norm": 0.9366886019706726, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0529, + "step": 6360 + }, + { + "epoch": 0.18213009292351678, + "grad_norm": 0.3178311586380005, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0455, + "step": 6370 + }, + { + "epoch": 0.18241601143674052, + "grad_norm": 0.9811000823974609, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.054, + "step": 6380 + }, + { + "epoch": 0.18270192994996426, + "grad_norm": 0.4635869562625885, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0466, + "step": 6390 + }, + { + "epoch": 0.182987848463188, + "grad_norm": 0.6958444118499756, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0448, + "step": 6400 + }, + { + "epoch": 0.18327376697641173, + "grad_norm": 0.765814483165741, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0558, + "step": 6410 + }, + { + "epoch": 0.18355968548963544, + "grad_norm": 0.4117525815963745, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0484, + "step": 6420 + }, + { + "epoch": 0.18384560400285918, + "grad_norm": 0.6114997267723083, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0449, + "step": 6430 + }, + { + "epoch": 0.18413152251608292, + "grad_norm": 0.6006572842597961, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0442, + "step": 6440 + }, + { + "epoch": 0.18441744102930666, + "grad_norm": 0.5918669104576111, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0472, + "step": 6450 + }, + { + "epoch": 0.18470335954253037, + "grad_norm": 0.42107391357421875, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0471, + "step": 6460 + }, + { + "epoch": 0.1849892780557541, + "grad_norm": 0.5666350722312927, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0451, + "step": 6470 + }, + { + "epoch": 0.18527519656897784, + "grad_norm": 0.6074198484420776, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.051, + "step": 6480 + }, + { + "epoch": 0.18556111508220158, + "grad_norm": 0.771105945110321, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.0402, + "step": 6490 + }, + { + "epoch": 0.18584703359542531, + "grad_norm": 0.6381934881210327, + "learning_rate": 1.844974808419918e-05, + "loss": 0.049, + "step": 6500 + }, + { + "epoch": 0.18613295210864902, + "grad_norm": 0.4039069712162018, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0477, + "step": 6510 + }, + { + "epoch": 0.18641887062187276, + "grad_norm": 0.8936404585838318, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0515, + "step": 6520 + }, + { + "epoch": 0.1867047891350965, + "grad_norm": 0.5358276963233948, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0397, + "step": 6530 + }, + { + "epoch": 0.18699070764832024, + "grad_norm": 0.7260947823524475, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0501, + "step": 6540 + }, + { + "epoch": 0.18727662616154395, + "grad_norm": 0.6378960609436035, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0575, + "step": 6550 + }, + { + "epoch": 0.18756254467476768, + "grad_norm": 0.5879429578781128, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.041, + "step": 6560 + }, + { + "epoch": 0.18784846318799142, + "grad_norm": 0.846297025680542, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.0494, + "step": 6570 + }, + { + "epoch": 0.18813438170121516, + "grad_norm": 0.5211764574050903, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0463, + "step": 6580 + }, + { + "epoch": 0.1884203002144389, + "grad_norm": 0.8060504794120789, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0486, + "step": 6590 + }, + { + "epoch": 0.1887062187276626, + "grad_norm": 0.5741685628890991, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0435, + "step": 6600 + }, + { + "epoch": 0.18899213724088634, + "grad_norm": 0.6195408701896667, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0609, + "step": 6610 + }, + { + "epoch": 0.18927805575411008, + "grad_norm": 0.46843090653419495, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0477, + "step": 6620 + }, + { + "epoch": 0.18956397426733382, + "grad_norm": 0.5169982314109802, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0515, + "step": 6630 + }, + { + "epoch": 0.18984989278055753, + "grad_norm": 0.5571608543395996, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0492, + "step": 6640 + }, + { + "epoch": 0.19013581129378126, + "grad_norm": 0.7798209190368652, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0682, + "step": 6650 + }, + { + "epoch": 0.190421729807005, + "grad_norm": 0.6120383143424988, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0516, + "step": 6660 + }, + { + "epoch": 0.19070764832022874, + "grad_norm": 1.0191924571990967, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.049, + "step": 6670 + }, + { + "epoch": 0.19099356683345248, + "grad_norm": 0.5271646976470947, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0461, + "step": 6680 + }, + { + "epoch": 0.1912794853466762, + "grad_norm": 0.3315111994743347, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0467, + "step": 6690 + }, + { + "epoch": 0.19156540385989992, + "grad_norm": 0.7598944306373596, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.0576, + "step": 6700 + }, + { + "epoch": 0.19185132237312366, + "grad_norm": 0.8039186596870422, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0489, + "step": 6710 + }, + { + "epoch": 0.1921372408863474, + "grad_norm": 0.911704957485199, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0508, + "step": 6720 + }, + { + "epoch": 0.1924231593995711, + "grad_norm": 0.6092261672019958, + "learning_rate": 1.832162565208597e-05, + "loss": 0.0494, + "step": 6730 + }, + { + "epoch": 0.19270907791279485, + "grad_norm": 0.7890674471855164, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.049, + "step": 6740 + }, + { + "epoch": 0.19299499642601858, + "grad_norm": 0.8601320385932922, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0402, + "step": 6750 + }, + { + "epoch": 0.19328091493924232, + "grad_norm": 0.8750951290130615, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0517, + "step": 6760 + }, + { + "epoch": 0.19356683345246606, + "grad_norm": 0.7143217921257019, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0414, + "step": 6770 + }, + { + "epoch": 0.19385275196568977, + "grad_norm": 0.8340809345245361, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0531, + "step": 6780 + }, + { + "epoch": 0.1941386704789135, + "grad_norm": 0.4074079692363739, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0455, + "step": 6790 + }, + { + "epoch": 0.19442458899213724, + "grad_norm": 0.5369135737419128, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0472, + "step": 6800 + }, + { + "epoch": 0.19471050750536098, + "grad_norm": 0.44467195868492126, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0589, + "step": 6810 + }, + { + "epoch": 0.1949964260185847, + "grad_norm": 0.6032440662384033, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0479, + "step": 6820 + }, + { + "epoch": 0.19528234453180843, + "grad_norm": 0.4078349173069, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0402, + "step": 6830 + }, + { + "epoch": 0.19556826304503216, + "grad_norm": 0.49480268359184265, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0432, + "step": 6840 + }, + { + "epoch": 0.1958541815582559, + "grad_norm": 0.9844514727592468, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0479, + "step": 6850 + }, + { + "epoch": 0.19614010007147964, + "grad_norm": 1.1353951692581177, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0539, + "step": 6860 + }, + { + "epoch": 0.19642601858470335, + "grad_norm": 0.7535272836685181, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0572, + "step": 6870 + }, + { + "epoch": 0.1967119370979271, + "grad_norm": 0.4950162470340729, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0466, + "step": 6880 + }, + { + "epoch": 0.19699785561115082, + "grad_norm": 0.5310598015785217, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0487, + "step": 6890 + }, + { + "epoch": 0.19728377412437456, + "grad_norm": 0.9481188654899597, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0448, + "step": 6900 + }, + { + "epoch": 0.19756969263759827, + "grad_norm": 0.5303207039833069, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0419, + "step": 6910 + }, + { + "epoch": 0.197855611150822, + "grad_norm": 0.6180852055549622, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0401, + "step": 6920 + }, + { + "epoch": 0.19814152966404575, + "grad_norm": 0.5310384631156921, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0471, + "step": 6930 + }, + { + "epoch": 0.19842744817726948, + "grad_norm": 0.546660304069519, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0481, + "step": 6940 + }, + { + "epoch": 0.19871336669049322, + "grad_norm": 0.7824214696884155, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0423, + "step": 6950 + }, + { + "epoch": 0.19899928520371693, + "grad_norm": 0.9130761623382568, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0436, + "step": 6960 + }, + { + "epoch": 0.19928520371694067, + "grad_norm": 1.0512481927871704, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0497, + "step": 6970 + }, + { + "epoch": 0.1995711222301644, + "grad_norm": 0.8660218715667725, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0533, + "step": 6980 + }, + { + "epoch": 0.19985704074338814, + "grad_norm": 0.5280078649520874, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0455, + "step": 6990 + }, + { + "epoch": 0.20014295925661185, + "grad_norm": 0.6151753067970276, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0476, + "step": 7000 + }, + { + "epoch": 0.2004288777698356, + "grad_norm": 0.7165628671646118, + "learning_rate": 1.815952390818299e-05, + "loss": 0.051, + "step": 7010 + }, + { + "epoch": 0.20071479628305933, + "grad_norm": 0.6857513189315796, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0566, + "step": 7020 + }, + { + "epoch": 0.20100071479628306, + "grad_norm": 0.5589154958724976, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0741, + "step": 7030 + }, + { + "epoch": 0.2012866333095068, + "grad_norm": 0.6684713959693909, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.0461, + "step": 7040 + }, + { + "epoch": 0.2015725518227305, + "grad_norm": 0.41142046451568604, + "learning_rate": 1.813582526827608e-05, + "loss": 0.043, + "step": 7050 + }, + { + "epoch": 0.20185847033595425, + "grad_norm": 0.29734253883361816, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0464, + "step": 7060 + }, + { + "epoch": 0.20214438884917799, + "grad_norm": 0.3914707899093628, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.052, + "step": 7070 + }, + { + "epoch": 0.20243030736240172, + "grad_norm": 0.5075880885124207, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0401, + "step": 7080 + }, + { + "epoch": 0.20271622587562543, + "grad_norm": 0.6182138919830322, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0428, + "step": 7090 + }, + { + "epoch": 0.20300214438884917, + "grad_norm": 1.0438663959503174, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0476, + "step": 7100 + }, + { + "epoch": 0.2032880629020729, + "grad_norm": 0.4646940529346466, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0419, + "step": 7110 + }, + { + "epoch": 0.20357398141529665, + "grad_norm": 0.4236893951892853, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0539, + "step": 7120 + }, + { + "epoch": 0.20385989992852038, + "grad_norm": 0.7975651025772095, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0459, + "step": 7130 + }, + { + "epoch": 0.2041458184417441, + "grad_norm": 0.9628227949142456, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0568, + "step": 7140 + }, + { + "epoch": 0.20443173695496783, + "grad_norm": 0.8878718614578247, + "learning_rate": 1.807599344877606e-05, + "loss": 0.0528, + "step": 7150 + }, + { + "epoch": 0.20471765546819157, + "grad_norm": 0.5407359004020691, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0417, + "step": 7160 + }, + { + "epoch": 0.2050035739814153, + "grad_norm": 0.4407803416252136, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0435, + "step": 7170 + }, + { + "epoch": 0.20528949249463901, + "grad_norm": 0.4055456221103668, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0652, + "step": 7180 + }, + { + "epoch": 0.20557541100786275, + "grad_norm": 0.44706887006759644, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0476, + "step": 7190 + }, + { + "epoch": 0.2058613295210865, + "grad_norm": 1.2640881538391113, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0496, + "step": 7200 + }, + { + "epoch": 0.20614724803431023, + "grad_norm": 0.3773214817047119, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0455, + "step": 7210 + }, + { + "epoch": 0.20643316654753396, + "grad_norm": 0.6460191011428833, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0765, + "step": 7220 + }, + { + "epoch": 0.20671908506075767, + "grad_norm": 0.6048172116279602, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0506, + "step": 7230 + }, + { + "epoch": 0.2070050035739814, + "grad_norm": 0.38502392172813416, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0487, + "step": 7240 + }, + { + "epoch": 0.20729092208720515, + "grad_norm": 1.5727262496948242, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0538, + "step": 7250 + }, + { + "epoch": 0.20757684060042889, + "grad_norm": 0.3985368609428406, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0501, + "step": 7260 + }, + { + "epoch": 0.2078627591136526, + "grad_norm": 0.4519219994544983, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0542, + "step": 7270 + }, + { + "epoch": 0.20814867762687633, + "grad_norm": 0.6547327637672424, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0628, + "step": 7280 + }, + { + "epoch": 0.20843459614010007, + "grad_norm": 0.7864896655082703, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0521, + "step": 7290 + }, + { + "epoch": 0.2087205146533238, + "grad_norm": 0.6605416536331177, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0501, + "step": 7300 + }, + { + "epoch": 0.20900643316654754, + "grad_norm": 0.8260928988456726, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0495, + "step": 7310 + }, + { + "epoch": 0.20929235167977125, + "grad_norm": 0.7167025804519653, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0465, + "step": 7320 + }, + { + "epoch": 0.209578270192995, + "grad_norm": 0.6838316917419434, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0449, + "step": 7330 + }, + { + "epoch": 0.20986418870621873, + "grad_norm": 0.46520882844924927, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.0441, + "step": 7340 + }, + { + "epoch": 0.21015010721944247, + "grad_norm": 0.680860698223114, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0498, + "step": 7350 + }, + { + "epoch": 0.21043602573266618, + "grad_norm": 0.6697542071342468, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0361, + "step": 7360 + }, + { + "epoch": 0.21072194424588991, + "grad_norm": 0.9322425127029419, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0561, + "step": 7370 + }, + { + "epoch": 0.21100786275911365, + "grad_norm": 0.7454982399940491, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0464, + "step": 7380 + }, + { + "epoch": 0.2112937812723374, + "grad_norm": 0.5052962899208069, + "learning_rate": 1.792902262617481e-05, + "loss": 0.042, + "step": 7390 + }, + { + "epoch": 0.21157969978556113, + "grad_norm": 0.622719407081604, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0392, + "step": 7400 + }, + { + "epoch": 0.21186561829878484, + "grad_norm": 0.8296751976013184, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0512, + "step": 7410 + }, + { + "epoch": 0.21215153681200857, + "grad_norm": 0.7341750860214233, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0407, + "step": 7420 + }, + { + "epoch": 0.2124374553252323, + "grad_norm": 0.8206498026847839, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0491, + "step": 7430 + }, + { + "epoch": 0.21272337383845605, + "grad_norm": 0.5625871419906616, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0405, + "step": 7440 + }, + { + "epoch": 0.21300929235167976, + "grad_norm": 0.600284218788147, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0464, + "step": 7450 + }, + { + "epoch": 0.2132952108649035, + "grad_norm": 1.0839911699295044, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0481, + "step": 7460 + }, + { + "epoch": 0.21358112937812723, + "grad_norm": 0.45663371682167053, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0478, + "step": 7470 + }, + { + "epoch": 0.21386704789135097, + "grad_norm": 0.9196961522102356, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0566, + "step": 7480 + }, + { + "epoch": 0.2141529664045747, + "grad_norm": 0.5013288855552673, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0442, + "step": 7490 + }, + { + "epoch": 0.21443888491779842, + "grad_norm": 0.6444706916809082, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0484, + "step": 7500 + }, + { + "epoch": 0.21472480343102215, + "grad_norm": 0.5789361000061035, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0536, + "step": 7510 + }, + { + "epoch": 0.2150107219442459, + "grad_norm": 0.7474827170372009, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0526, + "step": 7520 + }, + { + "epoch": 0.21529664045746963, + "grad_norm": 0.7054215669631958, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0538, + "step": 7530 + }, + { + "epoch": 0.21558255897069334, + "grad_norm": 0.9778858423233032, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0533, + "step": 7540 + }, + { + "epoch": 0.21586847748391708, + "grad_norm": 0.7189548015594482, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0479, + "step": 7550 + }, + { + "epoch": 0.2161543959971408, + "grad_norm": 0.8761522769927979, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0512, + "step": 7560 + }, + { + "epoch": 0.21644031451036455, + "grad_norm": 0.6686418652534485, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.06, + "step": 7570 + }, + { + "epoch": 0.2167262330235883, + "grad_norm": 0.6385156512260437, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0495, + "step": 7580 + }, + { + "epoch": 0.217012151536812, + "grad_norm": 0.4785522520542145, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0477, + "step": 7590 + }, + { + "epoch": 0.21729807005003574, + "grad_norm": 0.883179783821106, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.0472, + "step": 7600 + }, + { + "epoch": 0.21758398856325947, + "grad_norm": 0.5431568026542664, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0383, + "step": 7610 + }, + { + "epoch": 0.2178699070764832, + "grad_norm": 0.7085764408111572, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0465, + "step": 7620 + }, + { + "epoch": 0.21815582558970692, + "grad_norm": 0.4877212643623352, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0517, + "step": 7630 + }, + { + "epoch": 0.21844174410293066, + "grad_norm": 0.6874392032623291, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0555, + "step": 7640 + }, + { + "epoch": 0.2187276626161544, + "grad_norm": 0.9611791372299194, + "learning_rate": 1.776452218695584e-05, + "loss": 0.0415, + "step": 7650 + }, + { + "epoch": 0.21901358112937813, + "grad_norm": 0.3618314862251282, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0601, + "step": 7660 + }, + { + "epoch": 0.21929949964260187, + "grad_norm": 0.5366251468658447, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0383, + "step": 7670 + }, + { + "epoch": 0.21958541815582558, + "grad_norm": 0.6323129534721375, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0536, + "step": 7680 + }, + { + "epoch": 0.21987133666904932, + "grad_norm": 0.4621681571006775, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0442, + "step": 7690 + }, + { + "epoch": 0.22015725518227305, + "grad_norm": 0.9297679662704468, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0517, + "step": 7700 + }, + { + "epoch": 0.2204431736954968, + "grad_norm": 0.5950489640235901, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0468, + "step": 7710 + }, + { + "epoch": 0.2207290922087205, + "grad_norm": 0.30251142382621765, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0471, + "step": 7720 + }, + { + "epoch": 0.22101501072194424, + "grad_norm": 0.6247804760932922, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.0488, + "step": 7730 + }, + { + "epoch": 0.22130092923516798, + "grad_norm": 0.7118366360664368, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0567, + "step": 7740 + }, + { + "epoch": 0.2215868477483917, + "grad_norm": 0.6265056133270264, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.06, + "step": 7750 + }, + { + "epoch": 0.22187276626161545, + "grad_norm": 0.7232056260108948, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0393, + "step": 7760 + }, + { + "epoch": 0.22215868477483916, + "grad_norm": 0.7981307506561279, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0518, + "step": 7770 + }, + { + "epoch": 0.2224446032880629, + "grad_norm": 0.4492819011211395, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0425, + "step": 7780 + }, + { + "epoch": 0.22273052180128664, + "grad_norm": 0.578440248966217, + "learning_rate": 1.767371389304538e-05, + "loss": 0.043, + "step": 7790 + }, + { + "epoch": 0.22301644031451037, + "grad_norm": 0.8093826174736023, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0571, + "step": 7800 + }, + { + "epoch": 0.22330235882773408, + "grad_norm": 0.864661455154419, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.0429, + "step": 7810 + }, + { + "epoch": 0.22358827734095782, + "grad_norm": 0.50054532289505, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0404, + "step": 7820 + }, + { + "epoch": 0.22387419585418156, + "grad_norm": 0.5690511465072632, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0406, + "step": 7830 + }, + { + "epoch": 0.2241601143674053, + "grad_norm": 0.7075231671333313, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0478, + "step": 7840 + }, + { + "epoch": 0.22444603288062903, + "grad_norm": 0.6326742768287659, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.065, + "step": 7850 + }, + { + "epoch": 0.22473195139385274, + "grad_norm": 0.48305049538612366, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0456, + "step": 7860 + }, + { + "epoch": 0.22501786990707648, + "grad_norm": 0.6333707571029663, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.048, + "step": 7870 + }, + { + "epoch": 0.22530378842030022, + "grad_norm": 0.6568662524223328, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0422, + "step": 7880 + }, + { + "epoch": 0.22558970693352395, + "grad_norm": 0.6302695870399475, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0589, + "step": 7890 + }, + { + "epoch": 0.22587562544674766, + "grad_norm": 0.6373940110206604, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0504, + "step": 7900 + }, + { + "epoch": 0.2261615439599714, + "grad_norm": 0.7108445167541504, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0486, + "step": 7910 + }, + { + "epoch": 0.22644746247319514, + "grad_norm": 0.5274208784103394, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0693, + "step": 7920 + }, + { + "epoch": 0.22673338098641888, + "grad_norm": 0.4020678997039795, + "learning_rate": 1.758137056131309e-05, + "loss": 0.0461, + "step": 7930 + }, + { + "epoch": 0.2270192994996426, + "grad_norm": 0.5584745407104492, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0376, + "step": 7940 + }, + { + "epoch": 0.22730521801286632, + "grad_norm": 0.6614044904708862, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0461, + "step": 7950 + }, + { + "epoch": 0.22759113652609006, + "grad_norm": 0.506636917591095, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0431, + "step": 7960 + }, + { + "epoch": 0.2278770550393138, + "grad_norm": 0.5168156027793884, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0404, + "step": 7970 + }, + { + "epoch": 0.22816297355253753, + "grad_norm": 0.552480161190033, + "learning_rate": 1.754802282200567e-05, + "loss": 0.0565, + "step": 7980 + }, + { + "epoch": 0.22844889206576124, + "grad_norm": 0.8191191554069519, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0556, + "step": 7990 + }, + { + "epoch": 0.22873481057898498, + "grad_norm": 0.7767695188522339, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0447, + "step": 8000 + }, + { + "epoch": 0.22902072909220872, + "grad_norm": 0.9050281047821045, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0611, + "step": 8010 + }, + { + "epoch": 0.22930664760543246, + "grad_norm": 0.7805314660072327, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0532, + "step": 8020 + }, + { + "epoch": 0.2295925661186562, + "grad_norm": 0.6055987477302551, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0436, + "step": 8030 + }, + { + "epoch": 0.2298784846318799, + "grad_norm": 1.1075741052627563, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.053, + "step": 8040 + }, + { + "epoch": 0.23016440314510364, + "grad_norm": 0.6283855438232422, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0494, + "step": 8050 + }, + { + "epoch": 0.23045032165832738, + "grad_norm": 0.44009697437286377, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.047, + "step": 8060 + }, + { + "epoch": 0.23073624017155112, + "grad_norm": 0.4920162856578827, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0437, + "step": 8070 + }, + { + "epoch": 0.23102215868477483, + "grad_norm": 0.9286724328994751, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0513, + "step": 8080 + }, + { + "epoch": 0.23130807719799856, + "grad_norm": 0.6595107913017273, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0465, + "step": 8090 + }, + { + "epoch": 0.2315939957112223, + "grad_norm": 0.4930933713912964, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0422, + "step": 8100 + }, + { + "epoch": 0.23187991422444604, + "grad_norm": 0.6741859316825867, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0419, + "step": 8110 + }, + { + "epoch": 0.23216583273766978, + "grad_norm": 0.8081800937652588, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0449, + "step": 8120 + }, + { + "epoch": 0.23245175125089348, + "grad_norm": 1.0258036851882935, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0613, + "step": 8130 + }, + { + "epoch": 0.23273766976411722, + "grad_norm": 0.5007345080375671, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0473, + "step": 8140 + }, + { + "epoch": 0.23302358827734096, + "grad_norm": 0.3931804895401001, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0495, + "step": 8150 + }, + { + "epoch": 0.2333095067905647, + "grad_norm": 0.5907166600227356, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0449, + "step": 8160 + }, + { + "epoch": 0.2335954253037884, + "grad_norm": 0.49229851365089417, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0524, + "step": 8170 + }, + { + "epoch": 0.23388134381701214, + "grad_norm": 0.8386240601539612, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.0527, + "step": 8180 + }, + { + "epoch": 0.23416726233023588, + "grad_norm": 0.7806615829467773, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0529, + "step": 8190 + }, + { + "epoch": 0.23445318084345962, + "grad_norm": 0.5716270804405212, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0534, + "step": 8200 + }, + { + "epoch": 0.23473909935668336, + "grad_norm": 1.165761947631836, + "learning_rate": 1.739216409306913e-05, + "loss": 0.0591, + "step": 8210 + }, + { + "epoch": 0.23502501786990707, + "grad_norm": 0.867967426776886, + "learning_rate": 1.738529690353544e-05, + "loss": 0.049, + "step": 8220 + }, + { + "epoch": 0.2353109363831308, + "grad_norm": 0.5809492468833923, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0434, + "step": 8230 + }, + { + "epoch": 0.23559685489635454, + "grad_norm": 0.8418740034103394, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0461, + "step": 8240 + }, + { + "epoch": 0.23588277340957828, + "grad_norm": 0.5811617374420166, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0443, + "step": 8250 + }, + { + "epoch": 0.236168691922802, + "grad_norm": 0.7699318528175354, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0549, + "step": 8260 + }, + { + "epoch": 0.23645461043602573, + "grad_norm": 0.6066992878913879, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0415, + "step": 8270 + }, + { + "epoch": 0.23674052894924946, + "grad_norm": 0.7775973677635193, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0619, + "step": 8280 + }, + { + "epoch": 0.2370264474624732, + "grad_norm": 0.8320962190628052, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.048, + "step": 8290 + }, + { + "epoch": 0.23731236597569694, + "grad_norm": 0.7203818559646606, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0594, + "step": 8300 + }, + { + "epoch": 0.23759828448892065, + "grad_norm": 0.7634598612785339, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0614, + "step": 8310 + }, + { + "epoch": 0.23788420300214438, + "grad_norm": 0.557575523853302, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0517, + "step": 8320 + }, + { + "epoch": 0.23817012151536812, + "grad_norm": 1.0139968395233154, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0457, + "step": 8330 + }, + { + "epoch": 0.23845604002859186, + "grad_norm": 0.5543113946914673, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.048, + "step": 8340 + }, + { + "epoch": 0.23874195854181557, + "grad_norm": 1.0122590065002441, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.0509, + "step": 8350 + }, + { + "epoch": 0.2390278770550393, + "grad_norm": 0.8776134252548218, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0475, + "step": 8360 + }, + { + "epoch": 0.23931379556826304, + "grad_norm": 0.41230106353759766, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0467, + "step": 8370 + }, + { + "epoch": 0.23959971408148678, + "grad_norm": 0.5460986495018005, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0455, + "step": 8380 + }, + { + "epoch": 0.23988563259471052, + "grad_norm": 0.5896333456039429, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.051, + "step": 8390 + }, + { + "epoch": 0.24017155110793423, + "grad_norm": 0.536375105381012, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0432, + "step": 8400 + }, + { + "epoch": 0.24045746962115797, + "grad_norm": 0.7597050666809082, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0459, + "step": 8410 + }, + { + "epoch": 0.2407433881343817, + "grad_norm": 0.6669795513153076, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0584, + "step": 8420 + }, + { + "epoch": 0.24102930664760544, + "grad_norm": 0.3614502251148224, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.039, + "step": 8430 + }, + { + "epoch": 0.24131522516082915, + "grad_norm": 0.5618023872375488, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0394, + "step": 8440 + }, + { + "epoch": 0.2416011436740529, + "grad_norm": 0.5897185802459717, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0502, + "step": 8450 + }, + { + "epoch": 0.24188706218727662, + "grad_norm": 0.5622876882553101, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0382, + "step": 8460 + }, + { + "epoch": 0.24217298070050036, + "grad_norm": 0.5639696717262268, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0652, + "step": 8470 + }, + { + "epoch": 0.2424588992137241, + "grad_norm": 0.5686836242675781, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.0609, + "step": 8480 + }, + { + "epoch": 0.2427448177269478, + "grad_norm": 0.7248222827911377, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0512, + "step": 8490 + }, + { + "epoch": 0.24303073624017155, + "grad_norm": 0.6157225370407104, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0449, + "step": 8500 + }, + { + "epoch": 0.24331665475339528, + "grad_norm": 1.1660966873168945, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0427, + "step": 8510 + }, + { + "epoch": 0.24360257326661902, + "grad_norm": 1.1242589950561523, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0514, + "step": 8520 + }, + { + "epoch": 0.24388849177984273, + "grad_norm": 0.6135080456733704, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0491, + "step": 8530 + }, + { + "epoch": 0.24417441029306647, + "grad_norm": 0.41474589705467224, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0427, + "step": 8540 + }, + { + "epoch": 0.2444603288062902, + "grad_norm": 0.42195969820022583, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0486, + "step": 8550 + }, + { + "epoch": 0.24474624731951394, + "grad_norm": 0.3914433717727661, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0411, + "step": 8560 + }, + { + "epoch": 0.24503216583273768, + "grad_norm": 0.7590876817703247, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0439, + "step": 8570 + }, + { + "epoch": 0.2453180843459614, + "grad_norm": 0.4362296164035797, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0466, + "step": 8580 + }, + { + "epoch": 0.24560400285918513, + "grad_norm": 0.467949241399765, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.0502, + "step": 8590 + }, + { + "epoch": 0.24588992137240887, + "grad_norm": 0.4731729328632355, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0599, + "step": 8600 + }, + { + "epoch": 0.2461758398856326, + "grad_norm": 0.491644948720932, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0524, + "step": 8610 + }, + { + "epoch": 0.2464617583988563, + "grad_norm": 0.5254928469657898, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0469, + "step": 8620 + }, + { + "epoch": 0.24674767691208005, + "grad_norm": 0.5721238255500793, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0493, + "step": 8630 + }, + { + "epoch": 0.2470335954253038, + "grad_norm": 0.5806096792221069, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0391, + "step": 8640 + }, + { + "epoch": 0.24731951393852752, + "grad_norm": 0.6683222055435181, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0502, + "step": 8650 + }, + { + "epoch": 0.24760543245175126, + "grad_norm": 0.41728726029396057, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0411, + "step": 8660 + }, + { + "epoch": 0.24789135096497497, + "grad_norm": 0.6001113653182983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0413, + "step": 8670 + }, + { + "epoch": 0.2481772694781987, + "grad_norm": 0.43813610076904297, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0389, + "step": 8680 + }, + { + "epoch": 0.24846318799142245, + "grad_norm": 1.5533791780471802, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0597, + "step": 8690 + }, + { + "epoch": 0.24874910650464618, + "grad_norm": 1.175837755203247, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0473, + "step": 8700 + }, + { + "epoch": 0.2490350250178699, + "grad_norm": 0.4798300862312317, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0459, + "step": 8710 + }, + { + "epoch": 0.24932094353109363, + "grad_norm": 0.7334772944450378, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0432, + "step": 8720 + }, + { + "epoch": 0.24960686204431737, + "grad_norm": 0.9633310437202454, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.05, + "step": 8730 + }, + { + "epoch": 0.2498927805575411, + "grad_norm": 0.7353480458259583, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.05, + "step": 8740 + }, + { + "epoch": 0.25017869907076484, + "grad_norm": 0.5958748459815979, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0428, + "step": 8750 + }, + { + "epoch": 0.2504646175839886, + "grad_norm": 0.8538689613342285, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0498, + "step": 8760 + }, + { + "epoch": 0.2507505360972123, + "grad_norm": 0.606607973575592, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0409, + "step": 8770 + }, + { + "epoch": 0.251036454610436, + "grad_norm": 0.3999035060405731, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0714, + "step": 8780 + }, + { + "epoch": 0.25132237312365974, + "grad_norm": 0.807314932346344, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.057, + "step": 8790 + }, + { + "epoch": 0.2516082916368835, + "grad_norm": 0.5238217115402222, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0485, + "step": 8800 + }, + { + "epoch": 0.2518942101501072, + "grad_norm": 1.6465950012207031, + "learning_rate": 1.696714953556411e-05, + "loss": 0.056, + "step": 8810 + }, + { + "epoch": 0.25218012866333095, + "grad_norm": 0.6568214297294617, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0424, + "step": 8820 + }, + { + "epoch": 0.2524660471765547, + "grad_norm": 0.4695168137550354, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0456, + "step": 8830 + }, + { + "epoch": 0.2527519656897784, + "grad_norm": 0.5652263164520264, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0527, + "step": 8840 + }, + { + "epoch": 0.25303788420300216, + "grad_norm": 0.8887180685997009, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0441, + "step": 8850 + }, + { + "epoch": 0.2533238027162259, + "grad_norm": 0.8288971781730652, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.0513, + "step": 8860 + }, + { + "epoch": 0.2536097212294496, + "grad_norm": 0.8606051802635193, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0416, + "step": 8870 + }, + { + "epoch": 0.2538956397426733, + "grad_norm": 0.7235842347145081, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0481, + "step": 8880 + }, + { + "epoch": 0.25418155825589706, + "grad_norm": 0.9602673053741455, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.0465, + "step": 8890 + }, + { + "epoch": 0.2544674767691208, + "grad_norm": 0.6431217789649963, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0489, + "step": 8900 + }, + { + "epoch": 0.25475339528234453, + "grad_norm": 0.42215701937675476, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0376, + "step": 8910 + }, + { + "epoch": 0.25503931379556827, + "grad_norm": 0.5899976491928101, + "learning_rate": 1.688644181174108e-05, + "loss": 0.048, + "step": 8920 + }, + { + "epoch": 0.255325232308792, + "grad_norm": 0.9504411816596985, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.047, + "step": 8930 + }, + { + "epoch": 0.25561115082201574, + "grad_norm": 0.5808438062667847, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0535, + "step": 8940 + }, + { + "epoch": 0.2558970693352395, + "grad_norm": 0.3811270594596863, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.0418, + "step": 8950 + }, + { + "epoch": 0.25618298784846316, + "grad_norm": 1.0257363319396973, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.0548, + "step": 8960 + }, + { + "epoch": 0.2564689063616869, + "grad_norm": 0.7294469475746155, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0569, + "step": 8970 + }, + { + "epoch": 0.25675482487491064, + "grad_norm": 0.4967000484466553, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.0488, + "step": 8980 + }, + { + "epoch": 0.2570407433881344, + "grad_norm": 0.9160422086715698, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.0471, + "step": 8990 + }, + { + "epoch": 0.2573266619013581, + "grad_norm": 0.5125435590744019, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0487, + "step": 9000 + }, + { + "epoch": 0.25761258041458185, + "grad_norm": 0.5617201328277588, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0597, + "step": 9010 + }, + { + "epoch": 0.2578984989278056, + "grad_norm": 0.7771851420402527, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0485, + "step": 9020 + }, + { + "epoch": 0.2581844174410293, + "grad_norm": 0.8434289693832397, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0429, + "step": 9030 + }, + { + "epoch": 0.25847033595425306, + "grad_norm": 0.513541042804718, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0488, + "step": 9040 + }, + { + "epoch": 0.25875625446747674, + "grad_norm": 1.0142096281051636, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0517, + "step": 9050 + }, + { + "epoch": 0.2590421729807005, + "grad_norm": 0.6343669295310974, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.049, + "step": 9060 + }, + { + "epoch": 0.2593280914939242, + "grad_norm": 0.33996936678886414, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.037, + "step": 9070 + }, + { + "epoch": 0.25961401000714796, + "grad_norm": 0.5964446663856506, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0448, + "step": 9080 + }, + { + "epoch": 0.2598999285203717, + "grad_norm": 0.4989728629589081, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0463, + "step": 9090 + }, + { + "epoch": 0.26018584703359543, + "grad_norm": 0.7735986113548279, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0576, + "step": 9100 + }, + { + "epoch": 0.26047176554681917, + "grad_norm": 1.2520418167114258, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0577, + "step": 9110 + }, + { + "epoch": 0.2607576840600429, + "grad_norm": 0.45247936248779297, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0458, + "step": 9120 + }, + { + "epoch": 0.26104360257326664, + "grad_norm": 0.8944823145866394, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0498, + "step": 9130 + }, + { + "epoch": 0.2613295210864903, + "grad_norm": 0.8308315277099609, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.0545, + "step": 9140 + }, + { + "epoch": 0.26161543959971406, + "grad_norm": 0.6838778853416443, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0419, + "step": 9150 + }, + { + "epoch": 0.2619013581129378, + "grad_norm": 1.5998408794403076, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0591, + "step": 9160 + }, + { + "epoch": 0.26218727662616154, + "grad_norm": 0.8548596501350403, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.04, + "step": 9170 + }, + { + "epoch": 0.2624731951393853, + "grad_norm": 0.5784913897514343, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0464, + "step": 9180 + }, + { + "epoch": 0.262759113652609, + "grad_norm": 1.490502953529358, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0672, + "step": 9190 + }, + { + "epoch": 0.26304503216583275, + "grad_norm": 0.8950793743133545, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0532, + "step": 9200 + }, + { + "epoch": 0.2633309506790565, + "grad_norm": 0.5513611435890198, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.0416, + "step": 9210 + }, + { + "epoch": 0.2636168691922802, + "grad_norm": 1.0512864589691162, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0487, + "step": 9220 + }, + { + "epoch": 0.2639027877055039, + "grad_norm": 0.48180028796195984, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0543, + "step": 9230 + }, + { + "epoch": 0.26418870621872764, + "grad_norm": 0.5451590418815613, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0553, + "step": 9240 + }, + { + "epoch": 0.2644746247319514, + "grad_norm": 0.6986148953437805, + "learning_rate": 1.663934987558109e-05, + "loss": 0.0523, + "step": 9250 + }, + { + "epoch": 0.2647605432451751, + "grad_norm": 0.5977929830551147, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0446, + "step": 9260 + }, + { + "epoch": 0.26504646175839885, + "grad_norm": 0.6042361855506897, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0716, + "step": 9270 + }, + { + "epoch": 0.2653323802716226, + "grad_norm": 0.473418265581131, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0378, + "step": 9280 + }, + { + "epoch": 0.26561829878484633, + "grad_norm": 0.9332809448242188, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0484, + "step": 9290 + }, + { + "epoch": 0.26590421729807007, + "grad_norm": 0.5209246277809143, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0439, + "step": 9300 + }, + { + "epoch": 0.2661901358112938, + "grad_norm": 0.5742560625076294, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.0468, + "step": 9310 + }, + { + "epoch": 0.2664760543245175, + "grad_norm": 0.585503876209259, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0507, + "step": 9320 + }, + { + "epoch": 0.2667619728377412, + "grad_norm": 0.5254957675933838, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0436, + "step": 9330 + }, + { + "epoch": 0.26704789135096496, + "grad_norm": 0.48314452171325684, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0417, + "step": 9340 + }, + { + "epoch": 0.2673338098641887, + "grad_norm": 0.630020022392273, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0425, + "step": 9350 + }, + { + "epoch": 0.26761972837741244, + "grad_norm": 0.3545299470424652, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0338, + "step": 9360 + }, + { + "epoch": 0.2679056468906362, + "grad_norm": 0.6934211850166321, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0445, + "step": 9370 + }, + { + "epoch": 0.2681915654038599, + "grad_norm": 0.6544952392578125, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0451, + "step": 9380 + }, + { + "epoch": 0.26847748391708365, + "grad_norm": 0.4581946134567261, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0422, + "step": 9390 + }, + { + "epoch": 0.2687634024303074, + "grad_norm": 0.6338506937026978, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0576, + "step": 9400 + }, + { + "epoch": 0.26904932094353107, + "grad_norm": 0.8165014386177063, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0474, + "step": 9410 + }, + { + "epoch": 0.2693352394567548, + "grad_norm": 0.793222188949585, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0546, + "step": 9420 + }, + { + "epoch": 0.26962115796997854, + "grad_norm": 0.3669852316379547, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0461, + "step": 9430 + }, + { + "epoch": 0.2699070764832023, + "grad_norm": 0.7339810729026794, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0433, + "step": 9440 + }, + { + "epoch": 0.270192994996426, + "grad_norm": 0.4948982298374176, + "learning_rate": 1.648606940465527e-05, + "loss": 0.048, + "step": 9450 + }, + { + "epoch": 0.27047891350964975, + "grad_norm": 0.4681016206741333, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0437, + "step": 9460 + }, + { + "epoch": 0.2707648320228735, + "grad_norm": 0.5091472864151001, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0576, + "step": 9470 + }, + { + "epoch": 0.27105075053609723, + "grad_norm": 0.5683515071868896, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0503, + "step": 9480 + }, + { + "epoch": 0.27133666904932097, + "grad_norm": 0.626844048500061, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0495, + "step": 9490 + }, + { + "epoch": 0.27162258756254465, + "grad_norm": 0.6757943034172058, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0495, + "step": 9500 + }, + { + "epoch": 0.2719085060757684, + "grad_norm": 0.7049196362495422, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0579, + "step": 9510 + }, + { + "epoch": 0.2721944245889921, + "grad_norm": 0.6469181776046753, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.051, + "step": 9520 + }, + { + "epoch": 0.27248034310221586, + "grad_norm": 0.5414942502975464, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0433, + "step": 9530 + }, + { + "epoch": 0.2727662616154396, + "grad_norm": 0.5642798542976379, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0495, + "step": 9540 + }, + { + "epoch": 0.27305218012866334, + "grad_norm": 1.0527595281600952, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0445, + "step": 9550 + }, + { + "epoch": 0.2733380986418871, + "grad_norm": 0.8501784801483154, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0627, + "step": 9560 + }, + { + "epoch": 0.2736240171551108, + "grad_norm": 0.7892033457756042, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.043, + "step": 9570 + }, + { + "epoch": 0.27390993566833455, + "grad_norm": 0.3588624596595764, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0512, + "step": 9580 + }, + { + "epoch": 0.27419585418155823, + "grad_norm": 0.7474772930145264, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0468, + "step": 9590 + }, + { + "epoch": 0.27448177269478197, + "grad_norm": 0.6217718124389648, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0435, + "step": 9600 + }, + { + "epoch": 0.2747676912080057, + "grad_norm": 0.7711623907089233, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.054, + "step": 9610 + }, + { + "epoch": 0.27505360972122944, + "grad_norm": 0.8171371221542358, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0371, + "step": 9620 + }, + { + "epoch": 0.2753395282344532, + "grad_norm": 0.8668338060379028, + "learning_rate": 1.634591312387623e-05, + "loss": 0.055, + "step": 9630 + }, + { + "epoch": 0.2756254467476769, + "grad_norm": 0.5683940052986145, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0478, + "step": 9640 + }, + { + "epoch": 0.27591136526090065, + "grad_norm": 0.44098007678985596, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.0531, + "step": 9650 + }, + { + "epoch": 0.2761972837741244, + "grad_norm": 0.8305087685585022, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0462, + "step": 9660 + }, + { + "epoch": 0.27648320228734813, + "grad_norm": 0.9088799953460693, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0489, + "step": 9670 + }, + { + "epoch": 0.2767691208005718, + "grad_norm": 0.5590132474899292, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0486, + "step": 9680 + }, + { + "epoch": 0.27705503931379555, + "grad_norm": 0.776713490486145, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0443, + "step": 9690 + }, + { + "epoch": 0.2773409578270193, + "grad_norm": 0.6107578873634338, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0461, + "step": 9700 + }, + { + "epoch": 0.277626876340243, + "grad_norm": 0.4635901153087616, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0397, + "step": 9710 + }, + { + "epoch": 0.27791279485346676, + "grad_norm": 0.4220955967903137, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0463, + "step": 9720 + }, + { + "epoch": 0.2781987133666905, + "grad_norm": 0.4947739243507385, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0397, + "step": 9730 + }, + { + "epoch": 0.27848463187991424, + "grad_norm": 0.5589033961296082, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0426, + "step": 9740 + }, + { + "epoch": 0.278770550393138, + "grad_norm": 0.4904254972934723, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0458, + "step": 9750 + }, + { + "epoch": 0.2790564689063617, + "grad_norm": 0.34956127405166626, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0551, + "step": 9760 + }, + { + "epoch": 0.2793423874195854, + "grad_norm": 0.7638002038002014, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0424, + "step": 9770 + }, + { + "epoch": 0.27962830593280913, + "grad_norm": 0.48727869987487793, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0451, + "step": 9780 + }, + { + "epoch": 0.27991422444603287, + "grad_norm": 0.7314761281013489, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.0523, + "step": 9790 + }, + { + "epoch": 0.2802001429592566, + "grad_norm": 0.5017405152320862, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0423, + "step": 9800 + }, + { + "epoch": 0.28048606147248034, + "grad_norm": 0.8375383615493774, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0435, + "step": 9810 + }, + { + "epoch": 0.2807719799857041, + "grad_norm": 0.8702818155288696, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0487, + "step": 9820 + }, + { + "epoch": 0.2810578984989278, + "grad_norm": 0.4649866223335266, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0483, + "step": 9830 + }, + { + "epoch": 0.28134381701215155, + "grad_norm": 0.7464607357978821, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0463, + "step": 9840 + }, + { + "epoch": 0.2816297355253753, + "grad_norm": 0.48055607080459595, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0418, + "step": 9850 + }, + { + "epoch": 0.281915654038599, + "grad_norm": 0.9723506569862366, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0433, + "step": 9860 + }, + { + "epoch": 0.2822015725518227, + "grad_norm": 0.8859265446662903, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0605, + "step": 9870 + }, + { + "epoch": 0.28248749106504645, + "grad_norm": 0.8236640691757202, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0441, + "step": 9880 + }, + { + "epoch": 0.2827734095782702, + "grad_norm": 0.6617199778556824, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0515, + "step": 9890 + }, + { + "epoch": 0.2830593280914939, + "grad_norm": 0.8017821907997131, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0503, + "step": 9900 + }, + { + "epoch": 0.28334524660471766, + "grad_norm": 1.070827603340149, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0485, + "step": 9910 + }, + { + "epoch": 0.2836311651179414, + "grad_norm": 1.021888256072998, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0479, + "step": 9920 + }, + { + "epoch": 0.28391708363116513, + "grad_norm": 0.34402501583099365, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0391, + "step": 9930 + }, + { + "epoch": 0.28420300214438887, + "grad_norm": 0.58541339635849, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0461, + "step": 9940 + }, + { + "epoch": 0.28448892065761255, + "grad_norm": 0.8062207102775574, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0553, + "step": 9950 + }, + { + "epoch": 0.2847748391708363, + "grad_norm": 0.6435661315917969, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0536, + "step": 9960 + }, + { + "epoch": 0.28506075768406003, + "grad_norm": 0.5670832395553589, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0405, + "step": 9970 + }, + { + "epoch": 0.28534667619728377, + "grad_norm": 0.45282548666000366, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0458, + "step": 9980 + }, + { + "epoch": 0.2856325947105075, + "grad_norm": 0.42272916436195374, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0392, + "step": 9990 + }, + { + "epoch": 0.28591851322373124, + "grad_norm": 0.5791928768157959, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0453, + "step": 10000 + }, + { + "epoch": 0.286204431736955, + "grad_norm": 0.9841408729553223, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.052, + "step": 10010 + }, + { + "epoch": 0.2864903502501787, + "grad_norm": 0.8658338785171509, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0461, + "step": 10020 + }, + { + "epoch": 0.28677626876340245, + "grad_norm": 0.624788224697113, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0416, + "step": 10030 + }, + { + "epoch": 0.28706218727662614, + "grad_norm": 0.6108028888702393, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0548, + "step": 10040 + }, + { + "epoch": 0.2873481057898499, + "grad_norm": 0.7907708883285522, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0406, + "step": 10050 + }, + { + "epoch": 0.2876340243030736, + "grad_norm": 0.7695413827896118, + "learning_rate": 1.60029690609047e-05, + "loss": 0.061, + "step": 10060 + }, + { + "epoch": 0.28791994281629735, + "grad_norm": 0.4407683312892914, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0483, + "step": 10070 + }, + { + "epoch": 0.2882058613295211, + "grad_norm": 0.6242743730545044, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.039, + "step": 10080 + }, + { + "epoch": 0.2884917798427448, + "grad_norm": 0.8752113580703735, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0433, + "step": 10090 + }, + { + "epoch": 0.28877769835596856, + "grad_norm": 0.8834511041641235, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0348, + "step": 10100 + }, + { + "epoch": 0.2890636168691923, + "grad_norm": 1.0036063194274902, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0593, + "step": 10110 + }, + { + "epoch": 0.28934953538241603, + "grad_norm": 0.5511205196380615, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0459, + "step": 10120 + }, + { + "epoch": 0.2896354538956397, + "grad_norm": 0.7717337012290955, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0381, + "step": 10130 + }, + { + "epoch": 0.28992137240886345, + "grad_norm": 1.123363971710205, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0462, + "step": 10140 + }, + { + "epoch": 0.2902072909220872, + "grad_norm": 0.6212007403373718, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0446, + "step": 10150 + }, + { + "epoch": 0.29049320943531093, + "grad_norm": 0.5547964572906494, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0362, + "step": 10160 + }, + { + "epoch": 0.29077912794853467, + "grad_norm": 0.593225359916687, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0462, + "step": 10170 + }, + { + "epoch": 0.2910650464617584, + "grad_norm": 0.5569560527801514, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0508, + "step": 10180 + }, + { + "epoch": 0.29135096497498214, + "grad_norm": 0.5464656949043274, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0399, + "step": 10190 + }, + { + "epoch": 0.2916368834882059, + "grad_norm": 1.2456778287887573, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0494, + "step": 10200 + }, + { + "epoch": 0.2919228020014296, + "grad_norm": 0.7862445712089539, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0551, + "step": 10210 + }, + { + "epoch": 0.2922087205146533, + "grad_norm": 0.745941698551178, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0469, + "step": 10220 + }, + { + "epoch": 0.29249463902787703, + "grad_norm": 0.7562586665153503, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0492, + "step": 10230 + }, + { + "epoch": 0.29278055754110077, + "grad_norm": 0.659205973148346, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0453, + "step": 10240 + }, + { + "epoch": 0.2930664760543245, + "grad_norm": 0.6925905346870422, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0463, + "step": 10250 + }, + { + "epoch": 0.29335239456754825, + "grad_norm": 0.479115754365921, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0395, + "step": 10260 + }, + { + "epoch": 0.293638313080772, + "grad_norm": 0.5085121393203735, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0504, + "step": 10270 + }, + { + "epoch": 0.2939242315939957, + "grad_norm": 0.46833914518356323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0411, + "step": 10280 + }, + { + "epoch": 0.29421015010721946, + "grad_norm": 0.4534672796726227, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0491, + "step": 10290 + }, + { + "epoch": 0.2944960686204432, + "grad_norm": 0.5704737305641174, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0391, + "step": 10300 + }, + { + "epoch": 0.2947819871336669, + "grad_norm": 1.0342676639556885, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0681, + "step": 10310 + }, + { + "epoch": 0.2950679056468906, + "grad_norm": 0.5002169013023376, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0429, + "step": 10320 + }, + { + "epoch": 0.29535382416011435, + "grad_norm": 0.5565863847732544, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0575, + "step": 10330 + }, + { + "epoch": 0.2956397426733381, + "grad_norm": 0.7826551198959351, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0448, + "step": 10340 + }, + { + "epoch": 0.29592566118656183, + "grad_norm": 0.7019012570381165, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0436, + "step": 10350 + }, + { + "epoch": 0.29621157969978557, + "grad_norm": 0.8324534893035889, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0503, + "step": 10360 + }, + { + "epoch": 0.2964974982130093, + "grad_norm": 0.7064073085784912, + "learning_rate": 1.574895332125391e-05, + "loss": 0.041, + "step": 10370 + }, + { + "epoch": 0.29678341672623304, + "grad_norm": 0.5634047389030457, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0474, + "step": 10380 + }, + { + "epoch": 0.2970693352394568, + "grad_norm": 0.8504926562309265, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0502, + "step": 10390 + }, + { + "epoch": 0.29735525375268046, + "grad_norm": 0.508313775062561, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0368, + "step": 10400 + }, + { + "epoch": 0.2976411722659042, + "grad_norm": 0.5851112008094788, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0472, + "step": 10410 + }, + { + "epoch": 0.29792709077912793, + "grad_norm": 0.5689557790756226, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0479, + "step": 10420 + }, + { + "epoch": 0.29821300929235167, + "grad_norm": 0.5026743412017822, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0406, + "step": 10430 + }, + { + "epoch": 0.2984989278055754, + "grad_norm": 0.5662751197814941, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0441, + "step": 10440 + }, + { + "epoch": 0.29878484631879915, + "grad_norm": 0.899709939956665, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0432, + "step": 10450 + }, + { + "epoch": 0.2990707648320229, + "grad_norm": 0.4681940972805023, + "learning_rate": 1.567419089313346e-05, + "loss": 0.054, + "step": 10460 + }, + { + "epoch": 0.2993566833452466, + "grad_norm": 0.39646071195602417, + "learning_rate": 1.56658563993822e-05, + "loss": 0.0375, + "step": 10470 + }, + { + "epoch": 0.29964260185847036, + "grad_norm": 1.204815149307251, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0487, + "step": 10480 + }, + { + "epoch": 0.29992852037169404, + "grad_norm": 0.4507630467414856, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0516, + "step": 10490 + }, + { + "epoch": 0.3002144388849178, + "grad_norm": 0.9783321022987366, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0642, + "step": 10500 + }, + { + "epoch": 0.3005003573981415, + "grad_norm": 0.5406969785690308, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0447, + "step": 10510 + }, + { + "epoch": 0.30078627591136525, + "grad_norm": 0.44153860211372375, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0449, + "step": 10520 + }, + { + "epoch": 0.301072194424589, + "grad_norm": 0.5723687410354614, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0548, + "step": 10530 + }, + { + "epoch": 0.3013581129378127, + "grad_norm": 0.4453120529651642, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0434, + "step": 10540 + }, + { + "epoch": 0.30164403145103647, + "grad_norm": 0.34224697947502136, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0385, + "step": 10550 + }, + { + "epoch": 0.3019299499642602, + "grad_norm": 0.6389157176017761, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0569, + "step": 10560 + }, + { + "epoch": 0.30221586847748394, + "grad_norm": 0.5845953822135925, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0467, + "step": 10570 + }, + { + "epoch": 0.3025017869907076, + "grad_norm": 0.6581900119781494, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0422, + "step": 10580 + }, + { + "epoch": 0.30278770550393136, + "grad_norm": 0.4964161813259125, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0428, + "step": 10590 + }, + { + "epoch": 0.3030736240171551, + "grad_norm": 0.635380208492279, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.0442, + "step": 10600 + }, + { + "epoch": 0.30335954253037883, + "grad_norm": 0.9795969128608704, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0517, + "step": 10610 + }, + { + "epoch": 0.30364546104360257, + "grad_norm": 0.9987231492996216, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0514, + "step": 10620 + }, + { + "epoch": 0.3039313795568263, + "grad_norm": 0.6384946703910828, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0471, + "step": 10630 + }, + { + "epoch": 0.30421729807005005, + "grad_norm": 0.49352115392684937, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0351, + "step": 10640 + }, + { + "epoch": 0.3045032165832738, + "grad_norm": 0.45028480887413025, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0438, + "step": 10650 + }, + { + "epoch": 0.3047891350964975, + "grad_norm": 0.5717794895172119, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0491, + "step": 10660 + }, + { + "epoch": 0.3050750536097212, + "grad_norm": 0.5436326265335083, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0407, + "step": 10670 + }, + { + "epoch": 0.30536097212294494, + "grad_norm": 0.7777692675590515, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0436, + "step": 10680 + }, + { + "epoch": 0.3056468906361687, + "grad_norm": 0.6597929000854492, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0407, + "step": 10690 + }, + { + "epoch": 0.3059328091493924, + "grad_norm": 0.6059311032295227, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0481, + "step": 10700 + }, + { + "epoch": 0.30621872766261615, + "grad_norm": 0.5530681014060974, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0418, + "step": 10710 + }, + { + "epoch": 0.3065046461758399, + "grad_norm": 0.5778716802597046, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0429, + "step": 10720 + }, + { + "epoch": 0.3067905646890636, + "grad_norm": 0.4573792517185211, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0586, + "step": 10730 + }, + { + "epoch": 0.30707648320228736, + "grad_norm": 0.8193615078926086, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0474, + "step": 10740 + }, + { + "epoch": 0.3073624017155111, + "grad_norm": 0.9410123229026794, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0433, + "step": 10750 + }, + { + "epoch": 0.3076483202287348, + "grad_norm": 0.8244432806968689, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0462, + "step": 10760 + }, + { + "epoch": 0.3079342387419585, + "grad_norm": 0.644899845123291, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0479, + "step": 10770 + }, + { + "epoch": 0.30822015725518226, + "grad_norm": 0.28044867515563965, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.04, + "step": 10780 + }, + { + "epoch": 0.308506075768406, + "grad_norm": 0.6538394093513489, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0406, + "step": 10790 + }, + { + "epoch": 0.30879199428162973, + "grad_norm": 0.9572822451591492, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0505, + "step": 10800 + }, + { + "epoch": 0.30907791279485347, + "grad_norm": 0.539826512336731, + "learning_rate": 1.537928999540189e-05, + "loss": 0.05, + "step": 10810 + }, + { + "epoch": 0.3093638313080772, + "grad_norm": 0.801988959312439, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0454, + "step": 10820 + }, + { + "epoch": 0.30964974982130095, + "grad_norm": 0.57478928565979, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.039, + "step": 10830 + }, + { + "epoch": 0.3099356683345247, + "grad_norm": 0.6313017010688782, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0384, + "step": 10840 + }, + { + "epoch": 0.31022158684774837, + "grad_norm": 0.507997989654541, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0365, + "step": 10850 + }, + { + "epoch": 0.3105075053609721, + "grad_norm": 0.5152313709259033, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0487, + "step": 10860 + }, + { + "epoch": 0.31079342387419584, + "grad_norm": 0.6123478412628174, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0405, + "step": 10870 + }, + { + "epoch": 0.3110793423874196, + "grad_norm": 1.079551100730896, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0443, + "step": 10880 + }, + { + "epoch": 0.3113652609006433, + "grad_norm": 0.39866960048675537, + "learning_rate": 1.531098472380285e-05, + "loss": 0.04, + "step": 10890 + }, + { + "epoch": 0.31165117941386705, + "grad_norm": 0.3715427815914154, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0387, + "step": 10900 + }, + { + "epoch": 0.3119370979270908, + "grad_norm": 0.7201068997383118, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.054, + "step": 10910 + }, + { + "epoch": 0.3122230164403145, + "grad_norm": 0.9512631893157959, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0383, + "step": 10920 + }, + { + "epoch": 0.31250893495353826, + "grad_norm": 0.5948206186294556, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0472, + "step": 10930 + }, + { + "epoch": 0.31279485346676195, + "grad_norm": 0.7174249291419983, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0437, + "step": 10940 + }, + { + "epoch": 0.3130807719799857, + "grad_norm": 0.6190982460975647, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.0383, + "step": 10950 + }, + { + "epoch": 0.3133666904932094, + "grad_norm": 0.7733815312385559, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0327, + "step": 10960 + }, + { + "epoch": 0.31365260900643316, + "grad_norm": 1.2995271682739258, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0427, + "step": 10970 + }, + { + "epoch": 0.3139385275196569, + "grad_norm": 1.1102336645126343, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.04, + "step": 10980 + }, + { + "epoch": 0.31422444603288063, + "grad_norm": 0.7618277668952942, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0488, + "step": 10990 + }, + { + "epoch": 0.31451036454610437, + "grad_norm": 0.5355142951011658, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0436, + "step": 11000 + }, + { + "epoch": 0.3147962830593281, + "grad_norm": 1.3410072326660156, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0463, + "step": 11010 + }, + { + "epoch": 0.31508220157255185, + "grad_norm": 0.7810450196266174, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0493, + "step": 11020 + }, + { + "epoch": 0.3153681200857755, + "grad_norm": 0.6452206373214722, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0354, + "step": 11030 + }, + { + "epoch": 0.31565403859899926, + "grad_norm": 1.037593126296997, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.0418, + "step": 11040 + }, + { + "epoch": 0.315939957112223, + "grad_norm": 0.7032834887504578, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0431, + "step": 11050 + }, + { + "epoch": 0.31622587562544674, + "grad_norm": 0.5168939232826233, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.0472, + "step": 11060 + }, + { + "epoch": 0.3165117941386705, + "grad_norm": 0.5239925384521484, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0438, + "step": 11070 + }, + { + "epoch": 0.3167977126518942, + "grad_norm": 0.8209654688835144, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0506, + "step": 11080 + }, + { + "epoch": 0.31708363116511795, + "grad_norm": 0.5318232178688049, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0516, + "step": 11090 + }, + { + "epoch": 0.3173695496783417, + "grad_norm": 1.0823673009872437, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0482, + "step": 11100 + }, + { + "epoch": 0.3176554681915654, + "grad_norm": 0.6691215634346008, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.046, + "step": 11110 + }, + { + "epoch": 0.3179413867047891, + "grad_norm": 0.4862753450870514, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0423, + "step": 11120 + }, + { + "epoch": 0.31822730521801285, + "grad_norm": 0.4640316963195801, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0433, + "step": 11130 + }, + { + "epoch": 0.3185132237312366, + "grad_norm": 0.7841521501541138, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0445, + "step": 11140 + }, + { + "epoch": 0.3187991422444603, + "grad_norm": 0.6809426546096802, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0518, + "step": 11150 + }, + { + "epoch": 0.31908506075768406, + "grad_norm": 0.6195946931838989, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0569, + "step": 11160 + }, + { + "epoch": 0.3193709792709078, + "grad_norm": 0.7289860248565674, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0487, + "step": 11170 + }, + { + "epoch": 0.31965689778413153, + "grad_norm": 0.5575736165046692, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0409, + "step": 11180 + }, + { + "epoch": 0.31994281629735527, + "grad_norm": 0.8619267344474792, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0424, + "step": 11190 + }, + { + "epoch": 0.320228734810579, + "grad_norm": 0.740242063999176, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0474, + "step": 11200 + }, + { + "epoch": 0.3205146533238027, + "grad_norm": 0.4169894754886627, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.0395, + "step": 11210 + }, + { + "epoch": 0.3208005718370264, + "grad_norm": 0.5773794651031494, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0414, + "step": 11220 + }, + { + "epoch": 0.32108649035025016, + "grad_norm": 0.4941500723361969, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0484, + "step": 11230 + }, + { + "epoch": 0.3213724088634739, + "grad_norm": 0.7985579371452332, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.051, + "step": 11240 + }, + { + "epoch": 0.32165832737669764, + "grad_norm": 0.5262066721916199, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0434, + "step": 11250 + }, + { + "epoch": 0.3219442458899214, + "grad_norm": 0.4074312150478363, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0428, + "step": 11260 + }, + { + "epoch": 0.3222301644031451, + "grad_norm": 1.0757715702056885, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0468, + "step": 11270 + }, + { + "epoch": 0.32251608291636885, + "grad_norm": 0.7281575202941895, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0386, + "step": 11280 + }, + { + "epoch": 0.3228020014295926, + "grad_norm": 0.35078516602516174, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0413, + "step": 11290 + }, + { + "epoch": 0.32308791994281627, + "grad_norm": 0.5642452836036682, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0435, + "step": 11300 + }, + { + "epoch": 0.32337383845604, + "grad_norm": 0.5326974987983704, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0459, + "step": 11310 + }, + { + "epoch": 0.32365975696926375, + "grad_norm": 0.6212049126625061, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.0451, + "step": 11320 + }, + { + "epoch": 0.3239456754824875, + "grad_norm": 0.4887222349643707, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0445, + "step": 11330 + }, + { + "epoch": 0.3242315939957112, + "grad_norm": 0.6692403554916382, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0423, + "step": 11340 + }, + { + "epoch": 0.32451751250893496, + "grad_norm": 0.7166061997413635, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0445, + "step": 11350 + }, + { + "epoch": 0.3248034310221587, + "grad_norm": 0.5342463850975037, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0394, + "step": 11360 + }, + { + "epoch": 0.32508934953538243, + "grad_norm": 1.0617904663085938, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0401, + "step": 11370 + }, + { + "epoch": 0.32537526804860617, + "grad_norm": 0.9869458675384521, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0508, + "step": 11380 + }, + { + "epoch": 0.32566118656182985, + "grad_norm": 0.32021698355674744, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0346, + "step": 11390 + }, + { + "epoch": 0.3259471050750536, + "grad_norm": 0.6566154360771179, + "learning_rate": 1.486814531655139e-05, + "loss": 0.046, + "step": 11400 + }, + { + "epoch": 0.3262330235882773, + "grad_norm": 0.6716777086257935, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.045, + "step": 11410 + }, + { + "epoch": 0.32651894210150106, + "grad_norm": 0.7489042282104492, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0443, + "step": 11420 + }, + { + "epoch": 0.3268048606147248, + "grad_norm": 0.6040313243865967, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0418, + "step": 11430 + }, + { + "epoch": 0.32709077912794854, + "grad_norm": 0.4891999363899231, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0342, + "step": 11440 + }, + { + "epoch": 0.3273766976411723, + "grad_norm": 0.4264339506626129, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0414, + "step": 11450 + }, + { + "epoch": 0.327662616154396, + "grad_norm": 0.5535606741905212, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0362, + "step": 11460 + }, + { + "epoch": 0.32794853466761975, + "grad_norm": 0.566705048084259, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0472, + "step": 11470 + }, + { + "epoch": 0.32823445318084343, + "grad_norm": 0.8539089560508728, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0478, + "step": 11480 + }, + { + "epoch": 0.32852037169406717, + "grad_norm": 0.3981179893016815, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0429, + "step": 11490 + }, + { + "epoch": 0.3288062902072909, + "grad_norm": 1.3788808584213257, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0487, + "step": 11500 + }, + { + "epoch": 0.32909220872051465, + "grad_norm": 0.45551198720932007, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0384, + "step": 11510 + }, + { + "epoch": 0.3293781272337384, + "grad_norm": 0.6321517825126648, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0541, + "step": 11520 + }, + { + "epoch": 0.3296640457469621, + "grad_norm": 0.7971932888031006, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0445, + "step": 11530 + }, + { + "epoch": 0.32994996426018586, + "grad_norm": 0.5022657513618469, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0414, + "step": 11540 + }, + { + "epoch": 0.3302358827734096, + "grad_norm": 0.7302954196929932, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.044, + "step": 11550 + }, + { + "epoch": 0.33052180128663333, + "grad_norm": 0.5123834013938904, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0451, + "step": 11560 + }, + { + "epoch": 0.330807719799857, + "grad_norm": 0.5261625647544861, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.0416, + "step": 11570 + }, + { + "epoch": 0.33109363831308075, + "grad_norm": 0.5782840251922607, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0419, + "step": 11580 + }, + { + "epoch": 0.3313795568263045, + "grad_norm": 0.9754800796508789, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0403, + "step": 11590 + }, + { + "epoch": 0.3316654753395282, + "grad_norm": 0.48157551884651184, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0459, + "step": 11600 + }, + { + "epoch": 0.33195139385275196, + "grad_norm": 0.4394964277744293, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0461, + "step": 11610 + }, + { + "epoch": 0.3322373123659757, + "grad_norm": 1.220790147781372, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0448, + "step": 11620 + }, + { + "epoch": 0.33252323087919944, + "grad_norm": 0.6908231973648071, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0431, + "step": 11630 + }, + { + "epoch": 0.3328091493924232, + "grad_norm": 0.45382779836654663, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0379, + "step": 11640 + }, + { + "epoch": 0.3330950679056469, + "grad_norm": 0.5963619947433472, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.0465, + "step": 11650 + }, + { + "epoch": 0.3333809864188706, + "grad_norm": 0.676210880279541, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0411, + "step": 11660 + }, + { + "epoch": 0.33366690493209433, + "grad_norm": 0.893473744392395, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0443, + "step": 11670 + }, + { + "epoch": 0.33395282344531807, + "grad_norm": 0.30655553936958313, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.04, + "step": 11680 + }, + { + "epoch": 0.3342387419585418, + "grad_norm": 0.899615466594696, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0462, + "step": 11690 + }, + { + "epoch": 0.33452466047176554, + "grad_norm": 0.5037568807601929, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0394, + "step": 11700 + }, + { + "epoch": 0.3348105789849893, + "grad_norm": 0.573716402053833, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0426, + "step": 11710 + }, + { + "epoch": 0.335096497498213, + "grad_norm": 0.4985221326351166, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0422, + "step": 11720 + }, + { + "epoch": 0.33538241601143676, + "grad_norm": 0.8864797353744507, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0504, + "step": 11730 + }, + { + "epoch": 0.3356683345246605, + "grad_norm": 0.49209004640579224, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0409, + "step": 11740 + }, + { + "epoch": 0.3359542530378842, + "grad_norm": 0.5329779982566833, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0468, + "step": 11750 + }, + { + "epoch": 0.3362401715511079, + "grad_norm": 0.7552497386932373, + "learning_rate": 1.454836451908656e-05, + "loss": 0.041, + "step": 11760 + }, + { + "epoch": 0.33652609006433165, + "grad_norm": 0.5737242102622986, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0503, + "step": 11770 + }, + { + "epoch": 0.3368120085775554, + "grad_norm": 0.46150341629981995, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.0399, + "step": 11780 + }, + { + "epoch": 0.3370979270907791, + "grad_norm": 0.55389803647995, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0442, + "step": 11790 + }, + { + "epoch": 0.33738384560400286, + "grad_norm": 0.7647727727890015, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0472, + "step": 11800 + }, + { + "epoch": 0.3376697641172266, + "grad_norm": 0.8755397200584412, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0444, + "step": 11810 + }, + { + "epoch": 0.33795568263045034, + "grad_norm": 0.9257917404174805, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0416, + "step": 11820 + }, + { + "epoch": 0.3382416011436741, + "grad_norm": 0.4048840403556824, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0418, + "step": 11830 + }, + { + "epoch": 0.33852751965689776, + "grad_norm": 0.584200382232666, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0436, + "step": 11840 + }, + { + "epoch": 0.3388134381701215, + "grad_norm": 0.7565616369247437, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0407, + "step": 11850 + }, + { + "epoch": 0.33909935668334523, + "grad_norm": 0.8025793433189392, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0424, + "step": 11860 + }, + { + "epoch": 0.33938527519656897, + "grad_norm": 0.3123756945133209, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.044, + "step": 11870 + }, + { + "epoch": 0.3396711937097927, + "grad_norm": 0.8047941327095032, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0471, + "step": 11880 + }, + { + "epoch": 0.33995711222301644, + "grad_norm": 0.8675779104232788, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0443, + "step": 11890 + }, + { + "epoch": 0.3402430307362402, + "grad_norm": 0.47229406237602234, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0416, + "step": 11900 + }, + { + "epoch": 0.3405289492494639, + "grad_norm": 0.3775595426559448, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0512, + "step": 11910 + }, + { + "epoch": 0.34081486776268766, + "grad_norm": 0.6179372668266296, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0395, + "step": 11920 + }, + { + "epoch": 0.34110078627591134, + "grad_norm": 0.47618359327316284, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0407, + "step": 11930 + }, + { + "epoch": 0.3413867047891351, + "grad_norm": 0.5495609641075134, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.041, + "step": 11940 + }, + { + "epoch": 0.3416726233023588, + "grad_norm": 0.7276089191436768, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0445, + "step": 11950 + }, + { + "epoch": 0.34195854181558255, + "grad_norm": 0.9464111328125, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0471, + "step": 11960 + }, + { + "epoch": 0.3422444603288063, + "grad_norm": 0.8340250253677368, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0488, + "step": 11970 + }, + { + "epoch": 0.34253037884203, + "grad_norm": 0.6392719149589539, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0407, + "step": 11980 + }, + { + "epoch": 0.34281629735525376, + "grad_norm": 0.7563493251800537, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0388, + "step": 11990 + }, + { + "epoch": 0.3431022158684775, + "grad_norm": 0.7145271301269531, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.042, + "step": 12000 + }, + { + "epoch": 0.34338813438170124, + "grad_norm": 0.6522033214569092, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0507, + "step": 12010 + }, + { + "epoch": 0.3436740528949249, + "grad_norm": 0.4634755849838257, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0388, + "step": 12020 + }, + { + "epoch": 0.34395997140814866, + "grad_norm": 0.6681762337684631, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0498, + "step": 12030 + }, + { + "epoch": 0.3442458899213724, + "grad_norm": 0.5068351626396179, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0484, + "step": 12040 + }, + { + "epoch": 0.34453180843459613, + "grad_norm": 0.5424943566322327, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0406, + "step": 12050 + }, + { + "epoch": 0.34481772694781987, + "grad_norm": 0.674436628818512, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.04, + "step": 12060 + }, + { + "epoch": 0.3451036454610436, + "grad_norm": 0.8140727281570435, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0417, + "step": 12070 + }, + { + "epoch": 0.34538956397426734, + "grad_norm": 0.6394575238227844, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0413, + "step": 12080 + }, + { + "epoch": 0.3456754824874911, + "grad_norm": 0.5134334564208984, + "learning_rate": 1.425047976058418e-05, + "loss": 0.04, + "step": 12090 + }, + { + "epoch": 0.3459614010007148, + "grad_norm": 0.6670883297920227, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0397, + "step": 12100 + }, + { + "epoch": 0.3462473195139385, + "grad_norm": 0.49804338812828064, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0431, + "step": 12110 + }, + { + "epoch": 0.34653323802716224, + "grad_norm": 0.33912673592567444, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0492, + "step": 12120 + }, + { + "epoch": 0.346819156540386, + "grad_norm": 0.45478618144989014, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0427, + "step": 12130 + }, + { + "epoch": 0.3471050750536097, + "grad_norm": 0.6690845489501953, + "learning_rate": 1.420497389129506e-05, + "loss": 0.044, + "step": 12140 + }, + { + "epoch": 0.34739099356683345, + "grad_norm": 0.9296556115150452, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.042, + "step": 12150 + }, + { + "epoch": 0.3476769120800572, + "grad_norm": 0.4859760105609894, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0386, + "step": 12160 + }, + { + "epoch": 0.3479628305932809, + "grad_norm": 1.0067541599273682, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0495, + "step": 12170 + }, + { + "epoch": 0.34824874910650466, + "grad_norm": 0.7799471616744995, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0614, + "step": 12180 + }, + { + "epoch": 0.3485346676197284, + "grad_norm": 0.48603832721710205, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0422, + "step": 12190 + }, + { + "epoch": 0.3488205861329521, + "grad_norm": 1.2030225992202759, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0535, + "step": 12200 + }, + { + "epoch": 0.3491065046461758, + "grad_norm": 0.5523782968521118, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0437, + "step": 12210 + }, + { + "epoch": 0.34939242315939956, + "grad_norm": 0.9041968584060669, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0441, + "step": 12220 + }, + { + "epoch": 0.3496783416726233, + "grad_norm": 0.5859020948410034, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.0451, + "step": 12230 + }, + { + "epoch": 0.34996426018584703, + "grad_norm": 0.8736525177955627, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0439, + "step": 12240 + }, + { + "epoch": 0.35025017869907077, + "grad_norm": 0.4692678153514862, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0516, + "step": 12250 + }, + { + "epoch": 0.3505360972122945, + "grad_norm": 0.6326560974121094, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0427, + "step": 12260 + }, + { + "epoch": 0.35082201572551824, + "grad_norm": 0.6265914440155029, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0392, + "step": 12270 + }, + { + "epoch": 0.351107934238742, + "grad_norm": 0.8684681057929993, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0416, + "step": 12280 + }, + { + "epoch": 0.35139385275196566, + "grad_norm": 0.6076116561889648, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0405, + "step": 12290 + }, + { + "epoch": 0.3516797712651894, + "grad_norm": 0.36192813515663147, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0417, + "step": 12300 + }, + { + "epoch": 0.35196568977841314, + "grad_norm": 0.5561486482620239, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0397, + "step": 12310 + }, + { + "epoch": 0.3522516082916369, + "grad_norm": 0.5955346822738647, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0332, + "step": 12320 + }, + { + "epoch": 0.3525375268048606, + "grad_norm": 0.4861294627189636, + "learning_rate": 1.403120543105273e-05, + "loss": 0.0423, + "step": 12330 + }, + { + "epoch": 0.35282344531808435, + "grad_norm": 0.920704185962677, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0467, + "step": 12340 + }, + { + "epoch": 0.3531093638313081, + "grad_norm": 0.4749159514904022, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0425, + "step": 12350 + }, + { + "epoch": 0.3533952823445318, + "grad_norm": 0.5075432658195496, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0362, + "step": 12360 + }, + { + "epoch": 0.35368120085775556, + "grad_norm": 0.3057022988796234, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0378, + "step": 12370 + }, + { + "epoch": 0.35396711937097924, + "grad_norm": 0.48122167587280273, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0359, + "step": 12380 + }, + { + "epoch": 0.354253037884203, + "grad_norm": 0.39227673411369324, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0432, + "step": 12390 + }, + { + "epoch": 0.3545389563974267, + "grad_norm": 0.641839861869812, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0422, + "step": 12400 + }, + { + "epoch": 0.35482487491065046, + "grad_norm": 1.0422887802124023, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0445, + "step": 12410 + }, + { + "epoch": 0.3551107934238742, + "grad_norm": 0.5336428880691528, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0408, + "step": 12420 + }, + { + "epoch": 0.35539671193709793, + "grad_norm": 0.6634368896484375, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0374, + "step": 12430 + }, + { + "epoch": 0.35568263045032167, + "grad_norm": 0.5840758085250854, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0417, + "step": 12440 + }, + { + "epoch": 0.3559685489635454, + "grad_norm": 0.8465530872344971, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0449, + "step": 12450 + }, + { + "epoch": 0.35625446747676914, + "grad_norm": 0.48737838864326477, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0439, + "step": 12460 + }, + { + "epoch": 0.3565403859899928, + "grad_norm": 1.2267687320709229, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0395, + "step": 12470 + }, + { + "epoch": 0.35682630450321656, + "grad_norm": 0.4097842276096344, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0379, + "step": 12480 + }, + { + "epoch": 0.3571122230164403, + "grad_norm": 0.8895343542098999, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0415, + "step": 12490 + }, + { + "epoch": 0.35739814152966404, + "grad_norm": 0.6732933521270752, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0432, + "step": 12500 + }, + { + "epoch": 0.3576840600428878, + "grad_norm": 0.4521937966346741, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0442, + "step": 12510 + }, + { + "epoch": 0.3579699785561115, + "grad_norm": 0.5932701826095581, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0407, + "step": 12520 + }, + { + "epoch": 0.35825589706933525, + "grad_norm": 0.5595138669013977, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0387, + "step": 12530 + }, + { + "epoch": 0.358541815582559, + "grad_norm": 0.7205538153648376, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0393, + "step": 12540 + }, + { + "epoch": 0.3588277340957827, + "grad_norm": 0.4069580137729645, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0554, + "step": 12550 + }, + { + "epoch": 0.3591136526090064, + "grad_norm": 0.4881740212440491, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.0411, + "step": 12560 + }, + { + "epoch": 0.35939957112223014, + "grad_norm": 0.7710328102111816, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.043, + "step": 12570 + }, + { + "epoch": 0.3596854896354539, + "grad_norm": 0.6593908071517944, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.046, + "step": 12580 + }, + { + "epoch": 0.3599714081486776, + "grad_norm": 0.6712149977684021, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0392, + "step": 12590 + }, + { + "epoch": 0.36025732666190136, + "grad_norm": 0.6103658080101013, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.0482, + "step": 12600 + }, + { + "epoch": 0.3605432451751251, + "grad_norm": 0.5170528292655945, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0441, + "step": 12610 + }, + { + "epoch": 0.36082916368834883, + "grad_norm": 0.47434374690055847, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0436, + "step": 12620 + }, + { + "epoch": 0.36111508220157257, + "grad_norm": 0.6546452045440674, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0441, + "step": 12630 + }, + { + "epoch": 0.3614010007147963, + "grad_norm": 1.3334686756134033, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0464, + "step": 12640 + }, + { + "epoch": 0.36168691922802, + "grad_norm": 1.3882309198379517, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.0527, + "step": 12650 + }, + { + "epoch": 0.3619728377412437, + "grad_norm": 0.829872190952301, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0499, + "step": 12660 + }, + { + "epoch": 0.36225875625446746, + "grad_norm": 0.6917227506637573, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0513, + "step": 12670 + }, + { + "epoch": 0.3625446747676912, + "grad_norm": 0.3825722634792328, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0455, + "step": 12680 + }, + { + "epoch": 0.36283059328091494, + "grad_norm": 0.7726976275444031, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0366, + "step": 12690 + }, + { + "epoch": 0.3631165117941387, + "grad_norm": 0.48851099610328674, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0363, + "step": 12700 + }, + { + "epoch": 0.3634024303073624, + "grad_norm": 0.5034362077713013, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0461, + "step": 12710 + }, + { + "epoch": 0.36368834882058615, + "grad_norm": 0.8411096334457397, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0448, + "step": 12720 + }, + { + "epoch": 0.3639742673338099, + "grad_norm": 0.7185337543487549, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0366, + "step": 12730 + }, + { + "epoch": 0.36426018584703357, + "grad_norm": 0.5850857496261597, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0414, + "step": 12740 + }, + { + "epoch": 0.3645461043602573, + "grad_norm": 0.47304606437683105, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0464, + "step": 12750 + }, + { + "epoch": 0.36483202287348104, + "grad_norm": 0.7190109491348267, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0418, + "step": 12760 + }, + { + "epoch": 0.3651179413867048, + "grad_norm": 0.8053406476974487, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0407, + "step": 12770 + }, + { + "epoch": 0.3654038598999285, + "grad_norm": 0.8875076174736023, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0471, + "step": 12780 + }, + { + "epoch": 0.36568977841315226, + "grad_norm": 0.5206999182701111, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0478, + "step": 12790 + }, + { + "epoch": 0.365975696926376, + "grad_norm": 0.5034269690513611, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0422, + "step": 12800 + }, + { + "epoch": 0.36626161543959973, + "grad_norm": 0.9846853017807007, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.045, + "step": 12810 + }, + { + "epoch": 0.36654753395282347, + "grad_norm": 0.49341151118278503, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0471, + "step": 12820 + }, + { + "epoch": 0.36683345246604715, + "grad_norm": 0.765583336353302, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0411, + "step": 12830 + }, + { + "epoch": 0.3671193709792709, + "grad_norm": 0.5193378925323486, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.0522, + "step": 12840 + }, + { + "epoch": 0.3674052894924946, + "grad_norm": 0.8142374157905579, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0374, + "step": 12850 + }, + { + "epoch": 0.36769120800571836, + "grad_norm": 0.7233540415763855, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0516, + "step": 12860 + }, + { + "epoch": 0.3679771265189421, + "grad_norm": 0.38758793473243713, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0437, + "step": 12870 + }, + { + "epoch": 0.36826304503216584, + "grad_norm": 0.36923956871032715, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.041, + "step": 12880 + }, + { + "epoch": 0.3685489635453896, + "grad_norm": 1.0518147945404053, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0446, + "step": 12890 + }, + { + "epoch": 0.3688348820586133, + "grad_norm": 0.5833591818809509, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0362, + "step": 12900 + }, + { + "epoch": 0.36912080057183705, + "grad_norm": 0.6178849339485168, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.041, + "step": 12910 + }, + { + "epoch": 0.36940671908506073, + "grad_norm": 0.7599044442176819, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0473, + "step": 12920 + }, + { + "epoch": 0.36969263759828447, + "grad_norm": 0.7787651419639587, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0461, + "step": 12930 + }, + { + "epoch": 0.3699785561115082, + "grad_norm": 0.3847586512565613, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0413, + "step": 12940 + }, + { + "epoch": 0.37026447462473194, + "grad_norm": 0.6218805313110352, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0424, + "step": 12950 + }, + { + "epoch": 0.3705503931379557, + "grad_norm": 0.6770363450050354, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.0426, + "step": 12960 + }, + { + "epoch": 0.3708363116511794, + "grad_norm": 0.6817107796669006, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.041, + "step": 12970 + }, + { + "epoch": 0.37112223016440316, + "grad_norm": 1.6997944116592407, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0626, + "step": 12980 + }, + { + "epoch": 0.3714081486776269, + "grad_norm": 0.4540708363056183, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0356, + "step": 12990 + }, + { + "epoch": 0.37169406719085063, + "grad_norm": 0.4272336959838867, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0354, + "step": 13000 + }, + { + "epoch": 0.3719799857040743, + "grad_norm": 0.4723891019821167, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0438, + "step": 13010 + }, + { + "epoch": 0.37226590421729805, + "grad_norm": 0.5508099794387817, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.042, + "step": 13020 + }, + { + "epoch": 0.3725518227305218, + "grad_norm": 1.05836021900177, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0472, + "step": 13030 + }, + { + "epoch": 0.3728377412437455, + "grad_norm": 0.4397801458835602, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0462, + "step": 13040 + }, + { + "epoch": 0.37312365975696926, + "grad_norm": 0.3131158649921417, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0383, + "step": 13050 + }, + { + "epoch": 0.373409578270193, + "grad_norm": 0.5489990711212158, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0398, + "step": 13060 + }, + { + "epoch": 0.37369549678341674, + "grad_norm": 0.7425751686096191, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.0416, + "step": 13070 + }, + { + "epoch": 0.3739814152966405, + "grad_norm": 0.6337125301361084, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0387, + "step": 13080 + }, + { + "epoch": 0.3742673338098642, + "grad_norm": 0.656467854976654, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0431, + "step": 13090 + }, + { + "epoch": 0.3745532523230879, + "grad_norm": 0.7011964321136475, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0487, + "step": 13100 + }, + { + "epoch": 0.37483917083631163, + "grad_norm": 0.4949609041213989, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0429, + "step": 13110 + }, + { + "epoch": 0.37512508934953537, + "grad_norm": 0.6796516180038452, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0405, + "step": 13120 + }, + { + "epoch": 0.3754110078627591, + "grad_norm": 0.41161492466926575, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0359, + "step": 13130 + }, + { + "epoch": 0.37569692637598284, + "grad_norm": 0.4463254511356354, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0353, + "step": 13140 + }, + { + "epoch": 0.3759828448892066, + "grad_norm": 0.4082377254962921, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.047, + "step": 13150 + }, + { + "epoch": 0.3762687634024303, + "grad_norm": 0.7927104830741882, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0484, + "step": 13160 + }, + { + "epoch": 0.37655468191565405, + "grad_norm": 0.5212385058403015, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.041, + "step": 13170 + }, + { + "epoch": 0.3768406004288778, + "grad_norm": 0.7408128380775452, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0462, + "step": 13180 + }, + { + "epoch": 0.3771265189421015, + "grad_norm": 0.3847906291484833, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0361, + "step": 13190 + }, + { + "epoch": 0.3774124374553252, + "grad_norm": 0.5039756298065186, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0385, + "step": 13200 + }, + { + "epoch": 0.37769835596854895, + "grad_norm": 0.5682945251464844, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0369, + "step": 13210 + }, + { + "epoch": 0.3779842744817727, + "grad_norm": 0.5985261797904968, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0376, + "step": 13220 + }, + { + "epoch": 0.3782701929949964, + "grad_norm": 0.7080312967300415, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0514, + "step": 13230 + }, + { + "epoch": 0.37855611150822016, + "grad_norm": 0.7488406300544739, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0421, + "step": 13240 + }, + { + "epoch": 0.3788420300214439, + "grad_norm": 0.38066044449806213, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0411, + "step": 13250 + }, + { + "epoch": 0.37912794853466764, + "grad_norm": 0.6335283517837524, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.0526, + "step": 13260 + }, + { + "epoch": 0.3794138670478914, + "grad_norm": 0.7008160352706909, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0402, + "step": 13270 + }, + { + "epoch": 0.37969978556111506, + "grad_norm": 0.4219777286052704, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.039, + "step": 13280 + }, + { + "epoch": 0.3799857040743388, + "grad_norm": 0.6447705030441284, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.0412, + "step": 13290 + }, + { + "epoch": 0.38027162258756253, + "grad_norm": 0.4625374674797058, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0437, + "step": 13300 + }, + { + "epoch": 0.38055754110078627, + "grad_norm": 0.4056257903575897, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0377, + "step": 13310 + }, + { + "epoch": 0.38084345961401, + "grad_norm": 0.425281286239624, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0378, + "step": 13320 + }, + { + "epoch": 0.38112937812723374, + "grad_norm": 0.4031837582588196, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0361, + "step": 13330 + }, + { + "epoch": 0.3814152966404575, + "grad_norm": 0.469175785779953, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0391, + "step": 13340 + }, + { + "epoch": 0.3817012151536812, + "grad_norm": 0.36555227637290955, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0352, + "step": 13350 + }, + { + "epoch": 0.38198713366690495, + "grad_norm": 0.8802763819694519, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0412, + "step": 13360 + }, + { + "epoch": 0.38227305218012864, + "grad_norm": 0.5733079314231873, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0418, + "step": 13370 + }, + { + "epoch": 0.3825589706933524, + "grad_norm": 0.606238842010498, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0518, + "step": 13380 + }, + { + "epoch": 0.3828448892065761, + "grad_norm": 0.5096673369407654, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.0404, + "step": 13390 + }, + { + "epoch": 0.38313080771979985, + "grad_norm": 0.8240867853164673, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0513, + "step": 13400 + }, + { + "epoch": 0.3834167262330236, + "grad_norm": 0.3757685422897339, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0407, + "step": 13410 + }, + { + "epoch": 0.3837026447462473, + "grad_norm": 0.4560941755771637, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.0429, + "step": 13420 + }, + { + "epoch": 0.38398856325947106, + "grad_norm": 0.42831951379776, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0387, + "step": 13430 + }, + { + "epoch": 0.3842744817726948, + "grad_norm": 0.8373785614967346, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0473, + "step": 13440 + }, + { + "epoch": 0.38456040028591854, + "grad_norm": 0.9560670256614685, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0442, + "step": 13450 + }, + { + "epoch": 0.3848463187991422, + "grad_norm": 0.4101570248603821, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0429, + "step": 13460 + }, + { + "epoch": 0.38513223731236595, + "grad_norm": 0.673739492893219, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0525, + "step": 13470 + }, + { + "epoch": 0.3854181558255897, + "grad_norm": 1.126909852027893, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0499, + "step": 13480 + }, + { + "epoch": 0.38570407433881343, + "grad_norm": 0.571437656879425, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0431, + "step": 13490 + }, + { + "epoch": 0.38598999285203717, + "grad_norm": 0.5121229887008667, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0419, + "step": 13500 + }, + { + "epoch": 0.3862759113652609, + "grad_norm": 0.6143786907196045, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0373, + "step": 13510 + }, + { + "epoch": 0.38656182987848464, + "grad_norm": 0.395014226436615, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0457, + "step": 13520 + }, + { + "epoch": 0.3868477483917084, + "grad_norm": 0.46027693152427673, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0372, + "step": 13530 + }, + { + "epoch": 0.3871336669049321, + "grad_norm": 0.42744559049606323, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0417, + "step": 13540 + }, + { + "epoch": 0.3874195854181558, + "grad_norm": 0.4765837490558624, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0442, + "step": 13550 + }, + { + "epoch": 0.38770550393137954, + "grad_norm": 0.9767054319381714, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0397, + "step": 13560 + }, + { + "epoch": 0.3879914224446033, + "grad_norm": 0.5535935759544373, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0423, + "step": 13570 + }, + { + "epoch": 0.388277340957827, + "grad_norm": 0.3802829384803772, + "learning_rate": 1.285944160290905e-05, + "loss": 0.0329, + "step": 13580 + }, + { + "epoch": 0.38856325947105075, + "grad_norm": 0.6564178466796875, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0423, + "step": 13590 + }, + { + "epoch": 0.3888491779842745, + "grad_norm": 0.4400223195552826, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0356, + "step": 13600 + }, + { + "epoch": 0.3891350964974982, + "grad_norm": 0.4441612958908081, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0576, + "step": 13610 + }, + { + "epoch": 0.38942101501072196, + "grad_norm": 0.5270922780036926, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0406, + "step": 13620 + }, + { + "epoch": 0.3897069335239457, + "grad_norm": 0.6497722268104553, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0389, + "step": 13630 + }, + { + "epoch": 0.3899928520371694, + "grad_norm": 0.628182053565979, + "learning_rate": 1.280216624157504e-05, + "loss": 0.049, + "step": 13640 + }, + { + "epoch": 0.3902787705503931, + "grad_norm": 0.5242640376091003, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0389, + "step": 13650 + }, + { + "epoch": 0.39056468906361685, + "grad_norm": 0.5140895843505859, + "learning_rate": 1.278305741539386e-05, + "loss": 0.047, + "step": 13660 + }, + { + "epoch": 0.3908506075768406, + "grad_norm": 0.531012773513794, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0415, + "step": 13670 + }, + { + "epoch": 0.39113652609006433, + "grad_norm": 0.5066007375717163, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0411, + "step": 13680 + }, + { + "epoch": 0.39142244460328807, + "grad_norm": 1.0783177614212036, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0371, + "step": 13690 + }, + { + "epoch": 0.3917083631165118, + "grad_norm": 0.592755913734436, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0402, + "step": 13700 + }, + { + "epoch": 0.39199428162973554, + "grad_norm": 0.5595790147781372, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0543, + "step": 13710 + }, + { + "epoch": 0.3922802001429593, + "grad_norm": 0.5388237237930298, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0487, + "step": 13720 + }, + { + "epoch": 0.39256611865618296, + "grad_norm": 0.5311065316200256, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0389, + "step": 13730 + }, + { + "epoch": 0.3928520371694067, + "grad_norm": 0.8037494421005249, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0456, + "step": 13740 + }, + { + "epoch": 0.39313795568263044, + "grad_norm": 0.851921796798706, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0389, + "step": 13750 + }, + { + "epoch": 0.3934238741958542, + "grad_norm": 0.5924596190452576, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0401, + "step": 13760 + }, + { + "epoch": 0.3937097927090779, + "grad_norm": 0.5660725831985474, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0443, + "step": 13770 + }, + { + "epoch": 0.39399571122230165, + "grad_norm": 0.4110502004623413, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0438, + "step": 13780 + }, + { + "epoch": 0.3942816297355254, + "grad_norm": 0.7104408144950867, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.042, + "step": 13790 + }, + { + "epoch": 0.3945675482487491, + "grad_norm": 0.5490137338638306, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0477, + "step": 13800 + }, + { + "epoch": 0.39485346676197286, + "grad_norm": 0.4189203083515167, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0446, + "step": 13810 + }, + { + "epoch": 0.39513938527519654, + "grad_norm": 3.620929479598999, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0541, + "step": 13820 + }, + { + "epoch": 0.3954253037884203, + "grad_norm": 0.4670915901660919, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0391, + "step": 13830 + }, + { + "epoch": 0.395711222301644, + "grad_norm": 0.4475649297237396, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.04, + "step": 13840 + }, + { + "epoch": 0.39599714081486775, + "grad_norm": 0.4646693170070648, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0412, + "step": 13850 + }, + { + "epoch": 0.3962830593280915, + "grad_norm": 0.4141371250152588, + "learning_rate": 1.259152361972498e-05, + "loss": 0.039, + "step": 13860 + }, + { + "epoch": 0.39656897784131523, + "grad_norm": 0.7549411058425903, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0414, + "step": 13870 + }, + { + "epoch": 0.39685489635453897, + "grad_norm": 0.5687856078147888, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0441, + "step": 13880 + }, + { + "epoch": 0.3971408148677627, + "grad_norm": 0.582946240901947, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0451, + "step": 13890 + }, + { + "epoch": 0.39742673338098644, + "grad_norm": 0.6410595178604126, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0362, + "step": 13900 + }, + { + "epoch": 0.3977126518942101, + "grad_norm": 0.4375670850276947, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.0552, + "step": 13910 + }, + { + "epoch": 0.39799857040743386, + "grad_norm": 0.5675646662712097, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0373, + "step": 13920 + }, + { + "epoch": 0.3982844889206576, + "grad_norm": 0.544170618057251, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0449, + "step": 13930 + }, + { + "epoch": 0.39857040743388134, + "grad_norm": 0.44928276538848877, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0461, + "step": 13940 + }, + { + "epoch": 0.3988563259471051, + "grad_norm": 0.511382520198822, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0413, + "step": 13950 + }, + { + "epoch": 0.3991422444603288, + "grad_norm": 0.38443753123283386, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0374, + "step": 13960 + }, + { + "epoch": 0.39942816297355255, + "grad_norm": 0.5726080536842346, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0553, + "step": 13970 + }, + { + "epoch": 0.3997140814867763, + "grad_norm": 0.554694414138794, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0404, + "step": 13980 + }, + { + "epoch": 0.4, + "grad_norm": 0.4891316592693329, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.0418, + "step": 13990 + }, + { + "epoch": 0.4002859185132237, + "grad_norm": 0.5150312781333923, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0418, + "step": 14000 + }, + { + "epoch": 0.40057183702644744, + "grad_norm": 0.9077253937721252, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0415, + "step": 14010 + }, + { + "epoch": 0.4008577555396712, + "grad_norm": 0.9126781225204468, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.048, + "step": 14020 + }, + { + "epoch": 0.4011436740528949, + "grad_norm": 0.6264623999595642, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0411, + "step": 14030 + }, + { + "epoch": 0.40142959256611865, + "grad_norm": 0.523853600025177, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.051, + "step": 14040 + }, + { + "epoch": 0.4017155110793424, + "grad_norm": 0.6340035200119019, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0426, + "step": 14050 + }, + { + "epoch": 0.40200142959256613, + "grad_norm": 0.3594725430011749, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0397, + "step": 14060 + }, + { + "epoch": 0.40228734810578987, + "grad_norm": 0.941470742225647, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0402, + "step": 14070 + }, + { + "epoch": 0.4025732666190136, + "grad_norm": 0.840506911277771, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0473, + "step": 14080 + }, + { + "epoch": 0.4028591851322373, + "grad_norm": 0.3359200954437256, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0405, + "step": 14090 + }, + { + "epoch": 0.403145103645461, + "grad_norm": 0.49658629298210144, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0464, + "step": 14100 + }, + { + "epoch": 0.40343102215868476, + "grad_norm": 0.7940187454223633, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0417, + "step": 14110 + }, + { + "epoch": 0.4037169406719085, + "grad_norm": 0.30110660195350647, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0371, + "step": 14120 + }, + { + "epoch": 0.40400285918513223, + "grad_norm": 0.42845240235328674, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.053, + "step": 14130 + }, + { + "epoch": 0.40428877769835597, + "grad_norm": 0.997348427772522, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.041, + "step": 14140 + }, + { + "epoch": 0.4045746962115797, + "grad_norm": 0.4759966731071472, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0377, + "step": 14150 + }, + { + "epoch": 0.40486061472480345, + "grad_norm": 0.42045602202415466, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0397, + "step": 14160 + }, + { + "epoch": 0.4051465332380272, + "grad_norm": 0.6400002837181091, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0507, + "step": 14170 + }, + { + "epoch": 0.40543245175125087, + "grad_norm": 0.5473673939704895, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0359, + "step": 14180 + }, + { + "epoch": 0.4057183702644746, + "grad_norm": 0.7414730787277222, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0416, + "step": 14190 + }, + { + "epoch": 0.40600428877769834, + "grad_norm": 0.4691861867904663, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0363, + "step": 14200 + }, + { + "epoch": 0.4062902072909221, + "grad_norm": 0.9186112880706787, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0445, + "step": 14210 + }, + { + "epoch": 0.4065761258041458, + "grad_norm": 0.6782190203666687, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.40686204431736955, + "grad_norm": 0.6948013305664062, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.037, + "step": 14230 + }, + { + "epoch": 0.4071479628305933, + "grad_norm": 0.3034680485725403, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0371, + "step": 14240 + }, + { + "epoch": 0.40743388134381703, + "grad_norm": 0.4254174828529358, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0449, + "step": 14250 + }, + { + "epoch": 0.40771979985704077, + "grad_norm": 1.3622064590454102, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0428, + "step": 14260 + }, + { + "epoch": 0.40800571837026445, + "grad_norm": 0.5928359031677246, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.0424, + "step": 14270 + }, + { + "epoch": 0.4082916368834882, + "grad_norm": 0.9103132486343384, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0414, + "step": 14280 + }, + { + "epoch": 0.4085775553967119, + "grad_norm": 0.6338028311729431, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0376, + "step": 14290 + }, + { + "epoch": 0.40886347390993566, + "grad_norm": 0.9920284748077393, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0393, + "step": 14300 + }, + { + "epoch": 0.4091493924231594, + "grad_norm": 0.411830335855484, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0336, + "step": 14310 + }, + { + "epoch": 0.40943531093638313, + "grad_norm": 0.6977682709693909, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.0454, + "step": 14320 + }, + { + "epoch": 0.40972122944960687, + "grad_norm": 0.6303663849830627, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0453, + "step": 14330 + }, + { + "epoch": 0.4100071479628306, + "grad_norm": 0.3048207759857178, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0373, + "step": 14340 + }, + { + "epoch": 0.41029306647605435, + "grad_norm": 0.7683395743370056, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0438, + "step": 14350 + }, + { + "epoch": 0.41057898498927803, + "grad_norm": 0.5791511535644531, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0392, + "step": 14360 + }, + { + "epoch": 0.41086490350250177, + "grad_norm": 0.876626193523407, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0324, + "step": 14370 + }, + { + "epoch": 0.4111508220157255, + "grad_norm": 0.5971815586090088, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0368, + "step": 14380 + }, + { + "epoch": 0.41143674052894924, + "grad_norm": 0.6508862376213074, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0411, + "step": 14390 + }, + { + "epoch": 0.411722659042173, + "grad_norm": 0.4704359471797943, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.0351, + "step": 14400 + }, + { + "epoch": 0.4120085775553967, + "grad_norm": 0.4266453683376312, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0367, + "step": 14410 + }, + { + "epoch": 0.41229449606862045, + "grad_norm": 0.5898434519767761, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0376, + "step": 14420 + }, + { + "epoch": 0.4125804145818442, + "grad_norm": 0.8741532564163208, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0419, + "step": 14430 + }, + { + "epoch": 0.41286633309506793, + "grad_norm": 0.24328190088272095, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0333, + "step": 14440 + }, + { + "epoch": 0.4131522516082916, + "grad_norm": 0.4263601303100586, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.039, + "step": 14450 + }, + { + "epoch": 0.41343817012151535, + "grad_norm": 0.6311615109443665, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0454, + "step": 14460 + }, + { + "epoch": 0.4137240886347391, + "grad_norm": 0.7424519658088684, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0392, + "step": 14470 + }, + { + "epoch": 0.4140100071479628, + "grad_norm": 0.48323145508766174, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0374, + "step": 14480 + }, + { + "epoch": 0.41429592566118656, + "grad_norm": 0.38597407937049866, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0393, + "step": 14490 + }, + { + "epoch": 0.4145818441744103, + "grad_norm": 0.7251518964767456, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0431, + "step": 14500 + }, + { + "epoch": 0.41486776268763403, + "grad_norm": 0.44361060857772827, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0426, + "step": 14510 + }, + { + "epoch": 0.41515368120085777, + "grad_norm": 0.5625014305114746, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0372, + "step": 14520 + }, + { + "epoch": 0.4154395997140815, + "grad_norm": 0.27855798602104187, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0356, + "step": 14530 + }, + { + "epoch": 0.4157255182273052, + "grad_norm": 0.5966296195983887, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0387, + "step": 14540 + }, + { + "epoch": 0.41601143674052893, + "grad_norm": 0.49445512890815735, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0355, + "step": 14550 + }, + { + "epoch": 0.41629735525375267, + "grad_norm": 0.3813278377056122, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0456, + "step": 14560 + }, + { + "epoch": 0.4165832737669764, + "grad_norm": 0.5962988138198853, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0401, + "step": 14570 + }, + { + "epoch": 0.41686919228020014, + "grad_norm": 0.4028547406196594, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0371, + "step": 14580 + }, + { + "epoch": 0.4171551107934239, + "grad_norm": 1.348706841468811, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0426, + "step": 14590 + }, + { + "epoch": 0.4174410293066476, + "grad_norm": 1.2782070636749268, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0393, + "step": 14600 + }, + { + "epoch": 0.41772694781987135, + "grad_norm": 1.0024999380111694, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0436, + "step": 14610 + }, + { + "epoch": 0.4180128663330951, + "grad_norm": 0.35450127720832825, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0411, + "step": 14620 + }, + { + "epoch": 0.41829878484631877, + "grad_norm": 0.5827250480651855, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0372, + "step": 14630 + }, + { + "epoch": 0.4185847033595425, + "grad_norm": 0.5905774235725403, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0394, + "step": 14640 + }, + { + "epoch": 0.41887062187276625, + "grad_norm": 0.652074933052063, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0405, + "step": 14650 + }, + { + "epoch": 0.41915654038599, + "grad_norm": 0.7245490550994873, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0473, + "step": 14660 + }, + { + "epoch": 0.4194424588992137, + "grad_norm": 0.5153012871742249, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.043, + "step": 14670 + }, + { + "epoch": 0.41972837741243746, + "grad_norm": 0.516107976436615, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0434, + "step": 14680 + }, + { + "epoch": 0.4200142959256612, + "grad_norm": 0.4743354618549347, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0429, + "step": 14690 + }, + { + "epoch": 0.42030021443888493, + "grad_norm": 0.547875165939331, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0395, + "step": 14700 + }, + { + "epoch": 0.42058613295210867, + "grad_norm": 0.6398400068283081, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.0384, + "step": 14710 + }, + { + "epoch": 0.42087205146533235, + "grad_norm": 0.5891467332839966, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0399, + "step": 14720 + }, + { + "epoch": 0.4211579699785561, + "grad_norm": 0.3927595615386963, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0353, + "step": 14730 + }, + { + "epoch": 0.42144388849177983, + "grad_norm": 0.6477030515670776, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0492, + "step": 14740 + }, + { + "epoch": 0.42172980700500357, + "grad_norm": 0.7090615034103394, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.042, + "step": 14750 + }, + { + "epoch": 0.4220157255182273, + "grad_norm": 0.6572134494781494, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0406, + "step": 14760 + }, + { + "epoch": 0.42230164403145104, + "grad_norm": 0.787663996219635, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0424, + "step": 14770 + }, + { + "epoch": 0.4225875625446748, + "grad_norm": 0.8419309258460999, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0427, + "step": 14780 + }, + { + "epoch": 0.4228734810578985, + "grad_norm": 0.6204128861427307, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0364, + "step": 14790 + }, + { + "epoch": 0.42315939957112225, + "grad_norm": 0.7446070313453674, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0391, + "step": 14800 + }, + { + "epoch": 0.42344531808434593, + "grad_norm": 0.7446451783180237, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0384, + "step": 14810 + }, + { + "epoch": 0.42373123659756967, + "grad_norm": 0.6946475505828857, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0375, + "step": 14820 + }, + { + "epoch": 0.4240171551107934, + "grad_norm": 0.6997008323669434, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0393, + "step": 14830 + }, + { + "epoch": 0.42430307362401715, + "grad_norm": 0.4857316315174103, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0474, + "step": 14840 + }, + { + "epoch": 0.4245889921372409, + "grad_norm": 1.3516888618469238, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.047, + "step": 14850 + }, + { + "epoch": 0.4248749106504646, + "grad_norm": 0.40320220589637756, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0418, + "step": 14860 + }, + { + "epoch": 0.42516082916368836, + "grad_norm": 0.9002796411514282, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0434, + "step": 14870 + }, + { + "epoch": 0.4254467476769121, + "grad_norm": 0.3810071349143982, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.0338, + "step": 14880 + }, + { + "epoch": 0.42573266619013583, + "grad_norm": 0.5786157250404358, + "learning_rate": 1.159527607963768e-05, + "loss": 0.037, + "step": 14890 + }, + { + "epoch": 0.4260185847033595, + "grad_norm": 0.6316869258880615, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.0388, + "step": 14900 + }, + { + "epoch": 0.42630450321658325, + "grad_norm": 0.608745276927948, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0426, + "step": 14910 + }, + { + "epoch": 0.426590421729807, + "grad_norm": 0.6655036807060242, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0433, + "step": 14920 + }, + { + "epoch": 0.4268763402430307, + "grad_norm": 0.29059523344039917, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0507, + "step": 14930 + }, + { + "epoch": 0.42716225875625446, + "grad_norm": 0.9066076278686523, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0447, + "step": 14940 + }, + { + "epoch": 0.4274481772694782, + "grad_norm": 1.0660220384597778, + "learning_rate": 1.153689339251154e-05, + "loss": 0.0512, + "step": 14950 + }, + { + "epoch": 0.42773409578270194, + "grad_norm": 0.6081144213676453, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0426, + "step": 14960 + }, + { + "epoch": 0.4280200142959257, + "grad_norm": 0.46524369716644287, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0435, + "step": 14970 + }, + { + "epoch": 0.4283059328091494, + "grad_norm": 0.3497388958930969, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.0492, + "step": 14980 + }, + { + "epoch": 0.4285918513223731, + "grad_norm": 0.41300803422927856, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.034, + "step": 14990 + }, + { + "epoch": 0.42887776983559683, + "grad_norm": 0.4363289177417755, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0358, + "step": 15000 + }, + { + "epoch": 0.42916368834882057, + "grad_norm": 1.314915418624878, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.047, + "step": 15010 + }, + { + "epoch": 0.4294496068620443, + "grad_norm": 0.558199942111969, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0313, + "step": 15020 + }, + { + "epoch": 0.42973552537526805, + "grad_norm": 0.3857463598251343, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0416, + "step": 15030 + }, + { + "epoch": 0.4300214438884918, + "grad_norm": 0.4701749384403229, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0425, + "step": 15040 + }, + { + "epoch": 0.4303073624017155, + "grad_norm": 0.4611213803291321, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0457, + "step": 15050 + }, + { + "epoch": 0.43059328091493926, + "grad_norm": 0.5338016152381897, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.038, + "step": 15060 + }, + { + "epoch": 0.430879199428163, + "grad_norm": 0.9078943133354187, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.0395, + "step": 15070 + }, + { + "epoch": 0.4311651179413867, + "grad_norm": 0.5354048013687134, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0403, + "step": 15080 + }, + { + "epoch": 0.4314510364546104, + "grad_norm": 0.35511279106140137, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0377, + "step": 15090 + }, + { + "epoch": 0.43173695496783415, + "grad_norm": 0.37104350328445435, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0426, + "step": 15100 + }, + { + "epoch": 0.4320228734810579, + "grad_norm": 0.8916210532188416, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0387, + "step": 15110 + }, + { + "epoch": 0.4323087919942816, + "grad_norm": 0.514994740486145, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0384, + "step": 15120 + }, + { + "epoch": 0.43259471050750536, + "grad_norm": 0.8440690040588379, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0437, + "step": 15130 + }, + { + "epoch": 0.4328806290207291, + "grad_norm": 0.6815949082374573, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0453, + "step": 15140 + }, + { + "epoch": 0.43316654753395284, + "grad_norm": 0.33178189396858215, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0351, + "step": 15150 + }, + { + "epoch": 0.4334524660471766, + "grad_norm": 0.5686727166175842, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0368, + "step": 15160 + }, + { + "epoch": 0.43373838456040026, + "grad_norm": 0.44143930077552795, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0443, + "step": 15170 + }, + { + "epoch": 0.434024303073624, + "grad_norm": 0.3238232135772705, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0348, + "step": 15180 + }, + { + "epoch": 0.43431022158684773, + "grad_norm": 0.5038242340087891, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0343, + "step": 15190 + }, + { + "epoch": 0.43459614010007147, + "grad_norm": 0.4904351234436035, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0397, + "step": 15200 + }, + { + "epoch": 0.4348820586132952, + "grad_norm": 0.5325750708580017, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0499, + "step": 15210 + }, + { + "epoch": 0.43516797712651895, + "grad_norm": 0.39443954825401306, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.044, + "step": 15220 + }, + { + "epoch": 0.4354538956397427, + "grad_norm": 0.6782003045082092, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0358, + "step": 15230 + }, + { + "epoch": 0.4357398141529664, + "grad_norm": 0.47862571477890015, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0418, + "step": 15240 + }, + { + "epoch": 0.43602573266619016, + "grad_norm": 1.6515535116195679, + "learning_rate": 1.124468908014616e-05, + "loss": 0.043, + "step": 15250 + }, + { + "epoch": 0.43631165117941384, + "grad_norm": 0.4902660846710205, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0371, + "step": 15260 + }, + { + "epoch": 0.4365975696926376, + "grad_norm": 0.5742762088775635, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0369, + "step": 15270 + }, + { + "epoch": 0.4368834882058613, + "grad_norm": 0.42058590054512024, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0378, + "step": 15280 + }, + { + "epoch": 0.43716940671908505, + "grad_norm": 0.43729284405708313, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0352, + "step": 15290 + }, + { + "epoch": 0.4374553252323088, + "grad_norm": 0.4689466953277588, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0433, + "step": 15300 + }, + { + "epoch": 0.4377412437455325, + "grad_norm": 0.6272432208061218, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0548, + "step": 15310 + }, + { + "epoch": 0.43802716225875626, + "grad_norm": 1.1129611730575562, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.0437, + "step": 15320 + }, + { + "epoch": 0.43831308077198, + "grad_norm": 0.9332655072212219, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0503, + "step": 15330 + }, + { + "epoch": 0.43859899928520374, + "grad_norm": 0.35150477290153503, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0351, + "step": 15340 + }, + { + "epoch": 0.4388849177984274, + "grad_norm": 0.3826565444469452, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0361, + "step": 15350 + }, + { + "epoch": 0.43917083631165116, + "grad_norm": 0.817319393157959, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0352, + "step": 15360 + }, + { + "epoch": 0.4394567548248749, + "grad_norm": 0.4379598796367645, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0469, + "step": 15370 + }, + { + "epoch": 0.43974267333809863, + "grad_norm": 0.6475314497947693, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0456, + "step": 15380 + }, + { + "epoch": 0.44002859185132237, + "grad_norm": 0.529088020324707, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0453, + "step": 15390 + }, + { + "epoch": 0.4403145103645461, + "grad_norm": 0.4915194809436798, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0369, + "step": 15400 + }, + { + "epoch": 0.44060042887776985, + "grad_norm": 0.4766380786895752, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0391, + "step": 15410 + }, + { + "epoch": 0.4408863473909936, + "grad_norm": 0.34667786955833435, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0327, + "step": 15420 + }, + { + "epoch": 0.4411722659042173, + "grad_norm": 0.504242479801178, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0413, + "step": 15430 + }, + { + "epoch": 0.441458184417441, + "grad_norm": 0.49786439538002014, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0361, + "step": 15440 + }, + { + "epoch": 0.44174410293066474, + "grad_norm": 0.4997329115867615, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0368, + "step": 15450 + }, + { + "epoch": 0.4420300214438885, + "grad_norm": 0.2992185056209564, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0359, + "step": 15460 + }, + { + "epoch": 0.4423159399571122, + "grad_norm": 0.6645393371582031, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0401, + "step": 15470 + }, + { + "epoch": 0.44260185847033595, + "grad_norm": 0.6327983140945435, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0386, + "step": 15480 + }, + { + "epoch": 0.4428877769835597, + "grad_norm": 0.45607903599739075, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0386, + "step": 15490 + }, + { + "epoch": 0.4431736954967834, + "grad_norm": 0.4401610493659973, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0417, + "step": 15500 + }, + { + "epoch": 0.44345961401000716, + "grad_norm": 0.5778466463088989, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.0417, + "step": 15510 + }, + { + "epoch": 0.4437455325232309, + "grad_norm": 0.2164914309978485, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0355, + "step": 15520 + }, + { + "epoch": 0.4440314510364546, + "grad_norm": 0.3869318664073944, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.0361, + "step": 15530 + }, + { + "epoch": 0.4443173695496783, + "grad_norm": 0.3843154311180115, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0459, + "step": 15540 + }, + { + "epoch": 0.44460328806290206, + "grad_norm": 0.8488825559616089, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0406, + "step": 15550 + }, + { + "epoch": 0.4448892065761258, + "grad_norm": 0.5055183172225952, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0359, + "step": 15560 + }, + { + "epoch": 0.44517512508934953, + "grad_norm": 0.40923011302948, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0435, + "step": 15570 + }, + { + "epoch": 0.44546104360257327, + "grad_norm": 0.48997730016708374, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0395, + "step": 15580 + }, + { + "epoch": 0.445746962115797, + "grad_norm": 0.5149131417274475, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.041, + "step": 15590 + }, + { + "epoch": 0.44603288062902074, + "grad_norm": 0.7277303338050842, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0452, + "step": 15600 + }, + { + "epoch": 0.4463187991422445, + "grad_norm": 0.48676377534866333, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0363, + "step": 15610 + }, + { + "epoch": 0.44660471765546816, + "grad_norm": 0.49031221866607666, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0356, + "step": 15620 + }, + { + "epoch": 0.4468906361686919, + "grad_norm": 0.38877514004707336, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.036, + "step": 15630 + }, + { + "epoch": 0.44717655468191564, + "grad_norm": 0.570068895816803, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0403, + "step": 15640 + }, + { + "epoch": 0.4474624731951394, + "grad_norm": 0.48499882221221924, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0395, + "step": 15650 + }, + { + "epoch": 0.4477483917083631, + "grad_norm": 0.7251732349395752, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0399, + "step": 15660 + }, + { + "epoch": 0.44803431022158685, + "grad_norm": 0.3927334249019623, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0359, + "step": 15670 + }, + { + "epoch": 0.4483202287348106, + "grad_norm": 0.5614549517631531, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.035, + "step": 15680 + }, + { + "epoch": 0.4486061472480343, + "grad_norm": 0.383831262588501, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0416, + "step": 15690 + }, + { + "epoch": 0.44889206576125806, + "grad_norm": 1.9365276098251343, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0498, + "step": 15700 + }, + { + "epoch": 0.44917798427448175, + "grad_norm": 0.6964924931526184, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.034, + "step": 15710 + }, + { + "epoch": 0.4494639027877055, + "grad_norm": 0.5148108601570129, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0401, + "step": 15720 + }, + { + "epoch": 0.4497498213009292, + "grad_norm": 0.4529317617416382, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0361, + "step": 15730 + }, + { + "epoch": 0.45003573981415296, + "grad_norm": 0.6648512482643127, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0365, + "step": 15740 + }, + { + "epoch": 0.4503216583273767, + "grad_norm": 0.8183113932609558, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0416, + "step": 15750 + }, + { + "epoch": 0.45060757684060043, + "grad_norm": 0.8802638649940491, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0406, + "step": 15760 + }, + { + "epoch": 0.45089349535382417, + "grad_norm": 0.6329004764556885, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0395, + "step": 15770 + }, + { + "epoch": 0.4511794138670479, + "grad_norm": 0.35283520817756653, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0364, + "step": 15780 + }, + { + "epoch": 0.45146533238027164, + "grad_norm": 0.5156061053276062, + "learning_rate": 1.071827766589186e-05, + "loss": 0.031, + "step": 15790 + }, + { + "epoch": 0.4517512508934953, + "grad_norm": 0.37875205278396606, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.0375, + "step": 15800 + }, + { + "epoch": 0.45203716940671906, + "grad_norm": 0.5543273687362671, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0421, + "step": 15810 + }, + { + "epoch": 0.4523230879199428, + "grad_norm": 0.3808431923389435, + "learning_rate": 1.068904422762975e-05, + "loss": 0.0323, + "step": 15820 + }, + { + "epoch": 0.45260900643316654, + "grad_norm": 0.8648643493652344, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0396, + "step": 15830 + }, + { + "epoch": 0.4528949249463903, + "grad_norm": 0.7893536686897278, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0417, + "step": 15840 + }, + { + "epoch": 0.453180843459614, + "grad_norm": 0.904137134552002, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0384, + "step": 15850 + }, + { + "epoch": 0.45346676197283775, + "grad_norm": 0.6095889806747437, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0457, + "step": 15860 + }, + { + "epoch": 0.4537526804860615, + "grad_norm": 0.5691415667533875, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0438, + "step": 15870 + }, + { + "epoch": 0.4540385989992852, + "grad_norm": 0.37868618965148926, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0414, + "step": 15880 + }, + { + "epoch": 0.4543245175125089, + "grad_norm": 0.7962950468063354, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0405, + "step": 15890 + }, + { + "epoch": 0.45461043602573264, + "grad_norm": 0.8862378597259521, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0475, + "step": 15900 + }, + { + "epoch": 0.4548963545389564, + "grad_norm": 0.8762509822845459, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0472, + "step": 15910 + }, + { + "epoch": 0.4551822730521801, + "grad_norm": 0.6006313562393188, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0417, + "step": 15920 + }, + { + "epoch": 0.45546819156540386, + "grad_norm": 0.3340131938457489, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0374, + "step": 15930 + }, + { + "epoch": 0.4557541100786276, + "grad_norm": 0.2639552056789398, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0387, + "step": 15940 + }, + { + "epoch": 0.45604002859185133, + "grad_norm": 0.42564907670021057, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0376, + "step": 15950 + }, + { + "epoch": 0.45632594710507507, + "grad_norm": 0.503834068775177, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0344, + "step": 15960 + }, + { + "epoch": 0.4566118656182988, + "grad_norm": 0.5962334871292114, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0379, + "step": 15970 + }, + { + "epoch": 0.4568977841315225, + "grad_norm": 0.3271556794643402, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0361, + "step": 15980 + }, + { + "epoch": 0.4571837026447462, + "grad_norm": 0.5501612424850464, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0356, + "step": 15990 + }, + { + "epoch": 0.45746962115796996, + "grad_norm": 1.0399914979934692, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.039, + "step": 16000 + }, + { + "epoch": 0.4577555396711937, + "grad_norm": 0.42251288890838623, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0413, + "step": 16010 + }, + { + "epoch": 0.45804145818441744, + "grad_norm": 0.5694882869720459, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0501, + "step": 16020 + }, + { + "epoch": 0.4583273766976412, + "grad_norm": 0.37367814779281616, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0388, + "step": 16030 + }, + { + "epoch": 0.4586132952108649, + "grad_norm": 0.7947224974632263, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0324, + "step": 16040 + }, + { + "epoch": 0.45889921372408865, + "grad_norm": 0.47871798276901245, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0345, + "step": 16050 + }, + { + "epoch": 0.4591851322373124, + "grad_norm": 1.4443609714508057, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0502, + "step": 16060 + }, + { + "epoch": 0.45947105075053607, + "grad_norm": 0.8326191902160645, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0325, + "step": 16070 + }, + { + "epoch": 0.4597569692637598, + "grad_norm": 0.2887400686740875, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.035, + "step": 16080 + }, + { + "epoch": 0.46004288777698354, + "grad_norm": 0.34353405237197876, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0324, + "step": 16090 + }, + { + "epoch": 0.4603288062902073, + "grad_norm": 0.7319850325584412, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0307, + "step": 16100 + }, + { + "epoch": 0.460614724803431, + "grad_norm": 0.6628556847572327, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0398, + "step": 16110 + }, + { + "epoch": 0.46090064331665476, + "grad_norm": 0.39974722266197205, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.038, + "step": 16120 + }, + { + "epoch": 0.4611865618298785, + "grad_norm": 0.7769339680671692, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0425, + "step": 16130 + }, + { + "epoch": 0.46147248034310223, + "grad_norm": 0.6823691129684448, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.039, + "step": 16140 + }, + { + "epoch": 0.46175839885632597, + "grad_norm": 0.6749460697174072, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0388, + "step": 16150 + }, + { + "epoch": 0.46204431736954965, + "grad_norm": 1.0745635032653809, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.0406, + "step": 16160 + }, + { + "epoch": 0.4623302358827734, + "grad_norm": 0.8388734459877014, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0345, + "step": 16170 + }, + { + "epoch": 0.4626161543959971, + "grad_norm": 0.675828218460083, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0355, + "step": 16180 + }, + { + "epoch": 0.46290207290922086, + "grad_norm": 0.9872504472732544, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.0374, + "step": 16190 + }, + { + "epoch": 0.4631879914224446, + "grad_norm": 0.4705125689506531, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0416, + "step": 16200 + }, + { + "epoch": 0.46347390993566834, + "grad_norm": 0.43577539920806885, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.041, + "step": 16210 + }, + { + "epoch": 0.4637598284488921, + "grad_norm": 0.6472166180610657, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0372, + "step": 16220 + }, + { + "epoch": 0.4640457469621158, + "grad_norm": 1.0108906030654907, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0464, + "step": 16230 + }, + { + "epoch": 0.46433166547533955, + "grad_norm": 0.6221884489059448, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0396, + "step": 16240 + }, + { + "epoch": 0.46461758398856323, + "grad_norm": 0.7375202178955078, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0365, + "step": 16250 + }, + { + "epoch": 0.46490350250178697, + "grad_norm": 0.5090222358703613, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0404, + "step": 16260 + }, + { + "epoch": 0.4651894210150107, + "grad_norm": 0.5641722679138184, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0424, + "step": 16270 + }, + { + "epoch": 0.46547533952823444, + "grad_norm": 0.3946240246295929, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0433, + "step": 16280 + }, + { + "epoch": 0.4657612580414582, + "grad_norm": 0.525059700012207, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0399, + "step": 16290 + }, + { + "epoch": 0.4660471765546819, + "grad_norm": 0.6106441617012024, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0417, + "step": 16300 + }, + { + "epoch": 0.46633309506790566, + "grad_norm": 0.7064299583435059, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.0331, + "step": 16310 + }, + { + "epoch": 0.4666190135811294, + "grad_norm": 0.6251654624938965, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.0377, + "step": 16320 + }, + { + "epoch": 0.46690493209435313, + "grad_norm": 0.6626482009887695, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0355, + "step": 16330 + }, + { + "epoch": 0.4671908506075768, + "grad_norm": 0.32827794551849365, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0438, + "step": 16340 + }, + { + "epoch": 0.46747676912080055, + "grad_norm": 1.147644281387329, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.041, + "step": 16350 + }, + { + "epoch": 0.4677626876340243, + "grad_norm": 0.5785626769065857, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0362, + "step": 16360 + }, + { + "epoch": 0.468048606147248, + "grad_norm": 0.7087936401367188, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0364, + "step": 16370 + }, + { + "epoch": 0.46833452466047176, + "grad_norm": 0.7729533314704895, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0357, + "step": 16380 + }, + { + "epoch": 0.4686204431736955, + "grad_norm": 0.9080077409744263, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0445, + "step": 16390 + }, + { + "epoch": 0.46890636168691924, + "grad_norm": 0.5273067355155945, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0395, + "step": 16400 + }, + { + "epoch": 0.469192280200143, + "grad_norm": 0.4801991581916809, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0469, + "step": 16410 + }, + { + "epoch": 0.4694781987133667, + "grad_norm": 0.38060688972473145, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0377, + "step": 16420 + }, + { + "epoch": 0.4697641172265904, + "grad_norm": 1.335648536682129, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0444, + "step": 16430 + }, + { + "epoch": 0.47005003573981413, + "grad_norm": 0.6224690079689026, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0365, + "step": 16440 + }, + { + "epoch": 0.47033595425303787, + "grad_norm": 0.39938899874687195, + "learning_rate": 1.007637577910799e-05, + "loss": 0.037, + "step": 16450 + }, + { + "epoch": 0.4706218727662616, + "grad_norm": 0.47899872064590454, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0371, + "step": 16460 + }, + { + "epoch": 0.47090779127948534, + "grad_norm": 0.8991144895553589, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0337, + "step": 16470 + }, + { + "epoch": 0.4711937097927091, + "grad_norm": 0.6228598356246948, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0388, + "step": 16480 + }, + { + "epoch": 0.4714796283059328, + "grad_norm": 0.41108259558677673, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0378, + "step": 16490 + }, + { + "epoch": 0.47176554681915656, + "grad_norm": 0.722955048084259, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0381, + "step": 16500 + }, + { + "epoch": 0.4720514653323803, + "grad_norm": 0.6090973019599915, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0348, + "step": 16510 + }, + { + "epoch": 0.472337383845604, + "grad_norm": 0.483549565076828, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0456, + "step": 16520 + }, + { + "epoch": 0.4726233023588277, + "grad_norm": 0.4134727418422699, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0444, + "step": 16530 + }, + { + "epoch": 0.47290922087205145, + "grad_norm": 0.4629753530025482, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0382, + "step": 16540 + }, + { + "epoch": 0.4731951393852752, + "grad_norm": 0.8709504008293152, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0384, + "step": 16550 + }, + { + "epoch": 0.4734810578984989, + "grad_norm": 0.683397114276886, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0398, + "step": 16560 + }, + { + "epoch": 0.47376697641172266, + "grad_norm": 0.5743465423583984, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0431, + "step": 16570 + }, + { + "epoch": 0.4740528949249464, + "grad_norm": 1.0080480575561523, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0378, + "step": 16580 + }, + { + "epoch": 0.47433881343817014, + "grad_norm": 0.4668700098991394, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0369, + "step": 16590 + }, + { + "epoch": 0.4746247319513939, + "grad_norm": 0.6005896925926208, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0508, + "step": 16600 + }, + { + "epoch": 0.47491065046461756, + "grad_norm": 0.5788530707359314, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0354, + "step": 16610 + }, + { + "epoch": 0.4751965689778413, + "grad_norm": 0.38784441351890564, + "learning_rate": 9.911670744652783e-06, + "loss": 0.0357, + "step": 16620 + }, + { + "epoch": 0.47548248749106503, + "grad_norm": 0.4809567928314209, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0331, + "step": 16630 + }, + { + "epoch": 0.47576840600428877, + "grad_norm": 0.6647809147834778, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0473, + "step": 16640 + }, + { + "epoch": 0.4760543245175125, + "grad_norm": 0.3968522548675537, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0304, + "step": 16650 + }, + { + "epoch": 0.47634024303073624, + "grad_norm": 0.3258526027202606, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0387, + "step": 16660 + }, + { + "epoch": 0.47662616154396, + "grad_norm": 0.43442079424858093, + "learning_rate": 9.863295834019308e-06, + "loss": 0.04, + "step": 16670 + }, + { + "epoch": 0.4769120800571837, + "grad_norm": 0.36909565329551697, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0351, + "step": 16680 + }, + { + "epoch": 0.47719799857040746, + "grad_norm": 0.5566768050193787, + "learning_rate": 9.843955128197274e-06, + "loss": 0.031, + "step": 16690 + }, + { + "epoch": 0.47748391708363114, + "grad_norm": 0.5705142617225647, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0359, + "step": 16700 + }, + { + "epoch": 0.4777698355968549, + "grad_norm": 0.28931716084480286, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0407, + "step": 16710 + }, + { + "epoch": 0.4780557541100786, + "grad_norm": 0.5509498715400696, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0363, + "step": 16720 + }, + { + "epoch": 0.47834167262330235, + "grad_norm": 0.3564346432685852, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0364, + "step": 16730 + }, + { + "epoch": 0.4786275911365261, + "grad_norm": 0.32734423875808716, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0369, + "step": 16740 + }, + { + "epoch": 0.4789135096497498, + "grad_norm": 0.3048594892024994, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0367, + "step": 16750 + }, + { + "epoch": 0.47919942816297356, + "grad_norm": 0.9007049798965454, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0377, + "step": 16760 + }, + { + "epoch": 0.4794853466761973, + "grad_norm": 0.7010983824729919, + "learning_rate": 9.76664747972605e-06, + "loss": 0.039, + "step": 16770 + }, + { + "epoch": 0.47977126518942104, + "grad_norm": 0.644473135471344, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0466, + "step": 16780 + }, + { + "epoch": 0.4800571837026447, + "grad_norm": 0.6333492398262024, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0373, + "step": 16790 + }, + { + "epoch": 0.48034310221586846, + "grad_norm": 0.5148355960845947, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0392, + "step": 16800 + }, + { + "epoch": 0.4806290207290922, + "grad_norm": 0.7288355231285095, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0381, + "step": 16810 + }, + { + "epoch": 0.48091493924231593, + "grad_norm": 0.3674873113632202, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0418, + "step": 16820 + }, + { + "epoch": 0.48120085775553967, + "grad_norm": 0.5055420398712158, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0336, + "step": 16830 + }, + { + "epoch": 0.4814867762687634, + "grad_norm": 0.641754686832428, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0342, + "step": 16840 + }, + { + "epoch": 0.48177269478198714, + "grad_norm": 0.308200478553772, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0364, + "step": 16850 + }, + { + "epoch": 0.4820586132952109, + "grad_norm": 0.41361021995544434, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0342, + "step": 16860 + }, + { + "epoch": 0.4823445318084346, + "grad_norm": 0.45777833461761475, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0353, + "step": 16870 + }, + { + "epoch": 0.4826304503216583, + "grad_norm": 0.7587664723396301, + "learning_rate": 9.660501900166734e-06, + "loss": 0.043, + "step": 16880 + }, + { + "epoch": 0.48291636883488204, + "grad_norm": 0.8740283250808716, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0372, + "step": 16890 + }, + { + "epoch": 0.4832022873481058, + "grad_norm": 0.3009270429611206, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0373, + "step": 16900 + }, + { + "epoch": 0.4834882058613295, + "grad_norm": 0.4439285695552826, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0349, + "step": 16910 + }, + { + "epoch": 0.48377412437455325, + "grad_norm": 0.39849671721458435, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0394, + "step": 16920 + }, + { + "epoch": 0.484060042887777, + "grad_norm": 0.6423043608665466, + "learning_rate": 9.612315882780393e-06, + "loss": 0.0413, + "step": 16930 + }, + { + "epoch": 0.4843459614010007, + "grad_norm": 0.3683928847312927, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0387, + "step": 16940 + }, + { + "epoch": 0.48463187991422446, + "grad_norm": 0.7087769508361816, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0397, + "step": 16950 + }, + { + "epoch": 0.4849177984274482, + "grad_norm": 0.5348120927810669, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0405, + "step": 16960 + }, + { + "epoch": 0.4852037169406719, + "grad_norm": 0.549891471862793, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0363, + "step": 16970 + }, + { + "epoch": 0.4854896354538956, + "grad_norm": 0.7177272439002991, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0343, + "step": 16980 + }, + { + "epoch": 0.48577555396711936, + "grad_norm": 0.595417320728302, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0439, + "step": 16990 + }, + { + "epoch": 0.4860614724803431, + "grad_norm": 0.4838889241218567, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0387, + "step": 17000 + }, + { + "epoch": 0.48634739099356683, + "grad_norm": 0.6186223030090332, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0362, + "step": 17010 + }, + { + "epoch": 0.48663330950679057, + "grad_norm": 0.43383121490478516, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0381, + "step": 17020 + }, + { + "epoch": 0.4869192280200143, + "grad_norm": 0.6735527515411377, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0388, + "step": 17030 + }, + { + "epoch": 0.48720514653323804, + "grad_norm": 0.3746320605278015, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0491, + "step": 17040 + }, + { + "epoch": 0.4874910650464618, + "grad_norm": 0.29500988125801086, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0395, + "step": 17050 + }, + { + "epoch": 0.48777698355968546, + "grad_norm": 0.8518465757369995, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0435, + "step": 17060 + }, + { + "epoch": 0.4880629020729092, + "grad_norm": 0.9653190970420837, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0393, + "step": 17070 + }, + { + "epoch": 0.48834882058613294, + "grad_norm": 0.785724937915802, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0372, + "step": 17080 + }, + { + "epoch": 0.4886347390993567, + "grad_norm": 0.9450638890266418, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0406, + "step": 17090 + }, + { + "epoch": 0.4889206576125804, + "grad_norm": 0.645124077796936, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0361, + "step": 17100 + }, + { + "epoch": 0.48920657612580415, + "grad_norm": 0.3352372944355011, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0417, + "step": 17110 + }, + { + "epoch": 0.4894924946390279, + "grad_norm": 0.3858814835548401, + "learning_rate": 9.42959233811777e-06, + "loss": 0.0345, + "step": 17120 + }, + { + "epoch": 0.4897784131522516, + "grad_norm": 0.5403604507446289, + "learning_rate": 9.419993062475743e-06, + "loss": 0.0326, + "step": 17130 + }, + { + "epoch": 0.49006433166547536, + "grad_norm": 0.6986777782440186, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0417, + "step": 17140 + }, + { + "epoch": 0.49035025017869904, + "grad_norm": 0.5456675887107849, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0473, + "step": 17150 + }, + { + "epoch": 0.4906361686919228, + "grad_norm": 0.3961554765701294, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0341, + "step": 17160 + }, + { + "epoch": 0.4909220872051465, + "grad_norm": 0.5188277363777161, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0369, + "step": 17170 + }, + { + "epoch": 0.49120800571837026, + "grad_norm": 0.6042230725288391, + "learning_rate": 9.372024722887089e-06, + "loss": 0.0352, + "step": 17180 + }, + { + "epoch": 0.491493924231594, + "grad_norm": 0.5485941171646118, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0405, + "step": 17190 + }, + { + "epoch": 0.49177984274481773, + "grad_norm": 0.5856509804725647, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0402, + "step": 17200 + }, + { + "epoch": 0.49206576125804147, + "grad_norm": 0.8656556010246277, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0349, + "step": 17210 + }, + { + "epoch": 0.4923516797712652, + "grad_norm": 0.4041757583618164, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0364, + "step": 17220 + }, + { + "epoch": 0.49263759828448894, + "grad_norm": 0.6135975122451782, + "learning_rate": 9.324104146177972e-06, + "loss": 0.036, + "step": 17230 + }, + { + "epoch": 0.4929235167977126, + "grad_norm": 0.5101860165596008, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0359, + "step": 17240 + }, + { + "epoch": 0.49320943531093636, + "grad_norm": 0.9913426041603088, + "learning_rate": 9.304949604077693e-06, + "loss": 0.0552, + "step": 17250 + }, + { + "epoch": 0.4934953538241601, + "grad_norm": 0.6148158311843872, + "learning_rate": 9.295375311262483e-06, + "loss": 0.0388, + "step": 17260 + }, + { + "epoch": 0.49378127233738384, + "grad_norm": 0.6651721596717834, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0374, + "step": 17270 + }, + { + "epoch": 0.4940671908506076, + "grad_norm": 0.9545061588287354, + "learning_rate": 9.276232738281744e-06, + "loss": 0.035, + "step": 17280 + }, + { + "epoch": 0.4943531093638313, + "grad_norm": 0.8923225402832031, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0366, + "step": 17290 + }, + { + "epoch": 0.49463902787705505, + "grad_norm": 0.5337848663330078, + "learning_rate": 9.257098257046206e-06, + "loss": 0.0354, + "step": 17300 + }, + { + "epoch": 0.4949249463902788, + "grad_norm": 0.35039281845092773, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0341, + "step": 17310 + }, + { + "epoch": 0.4952108649035025, + "grad_norm": 0.47406911849975586, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0393, + "step": 17320 + }, + { + "epoch": 0.4954967834167262, + "grad_norm": 0.6226631999015808, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0375, + "step": 17330 + }, + { + "epoch": 0.49578270192994994, + "grad_norm": 0.6652712821960449, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0363, + "step": 17340 + }, + { + "epoch": 0.4960686204431737, + "grad_norm": 1.0042835474014282, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0368, + "step": 17350 + }, + { + "epoch": 0.4963545389563974, + "grad_norm": 0.4334045648574829, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0375, + "step": 17360 + }, + { + "epoch": 0.49664045746962115, + "grad_norm": 0.3561633229255676, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0347, + "step": 17370 + }, + { + "epoch": 0.4969263759828449, + "grad_norm": 0.5763550996780396, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0344, + "step": 17380 + }, + { + "epoch": 0.49721229449606863, + "grad_norm": 0.6306643486022949, + "learning_rate": 9.171095634265995e-06, + "loss": 0.037, + "step": 17390 + }, + { + "epoch": 0.49749821300929237, + "grad_norm": 0.4286569058895111, + "learning_rate": 9.161550369445782e-06, + "loss": 0.0308, + "step": 17400 + }, + { + "epoch": 0.4977841315225161, + "grad_norm": 0.577983558177948, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0393, + "step": 17410 + }, + { + "epoch": 0.4980700500357398, + "grad_norm": 0.5714932084083557, + "learning_rate": 9.142466323573853e-06, + "loss": 0.038, + "step": 17420 + }, + { + "epoch": 0.4983559685489635, + "grad_norm": 0.7529498338699341, + "learning_rate": 9.132927564918328e-06, + "loss": 0.033, + "step": 17430 + }, + { + "epoch": 0.49864188706218726, + "grad_norm": 0.5179672241210938, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0367, + "step": 17440 + }, + { + "epoch": 0.498927805575411, + "grad_norm": 0.38424569368362427, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0401, + "step": 17450 + }, + { + "epoch": 0.49921372408863474, + "grad_norm": 0.469460129737854, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0379, + "step": 17460 + }, + { + "epoch": 0.4994996426018585, + "grad_norm": 0.3285387456417084, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0399, + "step": 17470 + }, + { + "epoch": 0.4997855611150822, + "grad_norm": 0.49863550066947937, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0313, + "step": 17480 + }, + { + "epoch": 0.5000714796283059, + "grad_norm": 0.3926186263561249, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0454, + "step": 17490 + }, + { + "epoch": 0.5003573981415297, + "grad_norm": 0.4476146399974823, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0472, + "step": 17500 + }, + { + "epoch": 0.5006433166547534, + "grad_norm": 0.5645599961280823, + "learning_rate": 9.05669731553499e-06, + "loss": 0.0358, + "step": 17510 + }, + { + "epoch": 0.5009292351679772, + "grad_norm": 0.4813307225704193, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0364, + "step": 17520 + }, + { + "epoch": 0.5012151536812008, + "grad_norm": 0.49410971999168396, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0303, + "step": 17530 + }, + { + "epoch": 0.5015010721944246, + "grad_norm": 0.7172105312347412, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0404, + "step": 17540 + }, + { + "epoch": 0.5017869907076483, + "grad_norm": 0.43401873111724854, + "learning_rate": 9.018636566864313e-06, + "loss": 0.0402, + "step": 17550 + }, + { + "epoch": 0.502072909220872, + "grad_norm": 0.6497406363487244, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0364, + "step": 17560 + }, + { + "epoch": 0.5023588277340958, + "grad_norm": 0.44618356227874756, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0337, + "step": 17570 + }, + { + "epoch": 0.5026447462473195, + "grad_norm": 0.4186992049217224, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0381, + "step": 17580 + }, + { + "epoch": 0.5029306647605433, + "grad_norm": 0.7387974858283997, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0319, + "step": 17590 + }, + { + "epoch": 0.503216583273767, + "grad_norm": 0.8068642020225525, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0373, + "step": 17600 + }, + { + "epoch": 0.5035025017869907, + "grad_norm": 0.5773473978042603, + "learning_rate": 8.961615424107555e-06, + "loss": 0.0372, + "step": 17610 + }, + { + "epoch": 0.5037884203002144, + "grad_norm": 0.32488778233528137, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0334, + "step": 17620 + }, + { + "epoch": 0.5040743388134382, + "grad_norm": 0.33978500962257385, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0493, + "step": 17630 + }, + { + "epoch": 0.5043602573266619, + "grad_norm": 0.5897071361541748, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0335, + "step": 17640 + }, + { + "epoch": 0.5046461758398856, + "grad_norm": 0.6275895833969116, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0395, + "step": 17650 + }, + { + "epoch": 0.5049320943531094, + "grad_norm": 0.7995536923408508, + "learning_rate": 8.914163487132906e-06, + "loss": 0.0422, + "step": 17660 + }, + { + "epoch": 0.505218012866333, + "grad_norm": 0.8734716773033142, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0414, + "step": 17670 + }, + { + "epoch": 0.5055039313795568, + "grad_norm": 0.6239343881607056, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0333, + "step": 17680 + }, + { + "epoch": 0.5057898498927805, + "grad_norm": 0.42508623003959656, + "learning_rate": 8.885721609997551e-06, + "loss": 0.045, + "step": 17690 + }, + { + "epoch": 0.5060757684060043, + "grad_norm": 0.4272485673427582, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0506, + "step": 17700 + }, + { + "epoch": 0.506361686919228, + "grad_norm": 0.8006368279457092, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0431, + "step": 17710 + }, + { + "epoch": 0.5066476054324518, + "grad_norm": 0.5896835327148438, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0322, + "step": 17720 + }, + { + "epoch": 0.5069335239456755, + "grad_norm": 0.6880389451980591, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0322, + "step": 17730 + }, + { + "epoch": 0.5072194424588992, + "grad_norm": 1.4850202798843384, + "learning_rate": 8.83836825410936e-06, + "loss": 0.052, + "step": 17740 + }, + { + "epoch": 0.507505360972123, + "grad_norm": 0.7684240937232971, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0353, + "step": 17750 + }, + { + "epoch": 0.5077912794853466, + "grad_norm": 0.5456307530403137, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0419, + "step": 17760 + }, + { + "epoch": 0.5080771979985704, + "grad_norm": 0.5775120258331299, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0366, + "step": 17770 + }, + { + "epoch": 0.5083631165117941, + "grad_norm": 0.6453070044517517, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0341, + "step": 17780 + }, + { + "epoch": 0.5086490350250179, + "grad_norm": 0.7906973361968994, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0405, + "step": 17790 + }, + { + "epoch": 0.5089349535382416, + "grad_norm": 1.0740606784820557, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0344, + "step": 17800 + }, + { + "epoch": 0.5092208720514654, + "grad_norm": 0.41854357719421387, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0334, + "step": 17810 + }, + { + "epoch": 0.5095067905646891, + "grad_norm": 0.6328964233398438, + "learning_rate": 8.762735374981932e-06, + "loss": 0.036, + "step": 17820 + }, + { + "epoch": 0.5097927090779127, + "grad_norm": 0.40875789523124695, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0338, + "step": 17830 + }, + { + "epoch": 0.5100786275911365, + "grad_norm": 0.5056312084197998, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0332, + "step": 17840 + }, + { + "epoch": 0.5103645461043602, + "grad_norm": 0.5005037784576416, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0416, + "step": 17850 + }, + { + "epoch": 0.510650464617584, + "grad_norm": 0.5689167380332947, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0329, + "step": 17860 + }, + { + "epoch": 0.5109363831308077, + "grad_norm": 0.5222717523574829, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0336, + "step": 17870 + }, + { + "epoch": 0.5112223016440315, + "grad_norm": 0.5998329520225525, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0354, + "step": 17880 + }, + { + "epoch": 0.5115082201572552, + "grad_norm": 0.4684480130672455, + "learning_rate": 8.69669425266315e-06, + "loss": 0.05, + "step": 17890 + }, + { + "epoch": 0.511794138670479, + "grad_norm": 0.4061124622821808, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0384, + "step": 17900 + }, + { + "epoch": 0.5120800571837026, + "grad_norm": 0.5025928020477295, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0386, + "step": 17910 + }, + { + "epoch": 0.5123659756969263, + "grad_norm": 0.3731222152709961, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0378, + "step": 17920 + }, + { + "epoch": 0.5126518942101501, + "grad_norm": 0.7784973978996277, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0419, + "step": 17930 + }, + { + "epoch": 0.5129378127233738, + "grad_norm": 0.7074074745178223, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0386, + "step": 17940 + }, + { + "epoch": 0.5132237312365976, + "grad_norm": 0.49802306294441223, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0418, + "step": 17950 + }, + { + "epoch": 0.5135096497498213, + "grad_norm": 0.4355427920818329, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0431, + "step": 17960 + }, + { + "epoch": 0.5137955682630451, + "grad_norm": 0.672635555267334, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0403, + "step": 17970 + }, + { + "epoch": 0.5140814867762687, + "grad_norm": 0.6733908653259277, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0487, + "step": 17980 + }, + { + "epoch": 0.5143674052894925, + "grad_norm": 0.43711504340171814, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0378, + "step": 17990 + }, + { + "epoch": 0.5146533238027162, + "grad_norm": 0.6371222138404846, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0336, + "step": 18000 + }, + { + "epoch": 0.5149392423159399, + "grad_norm": 0.8007041811943054, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0371, + "step": 18010 + }, + { + "epoch": 0.5152251608291637, + "grad_norm": 0.4725078344345093, + "learning_rate": 8.574400723012433e-06, + "loss": 0.037, + "step": 18020 + }, + { + "epoch": 0.5155110793423874, + "grad_norm": 0.34229791164398193, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0353, + "step": 18030 + }, + { + "epoch": 0.5157969978556112, + "grad_norm": 0.27863454818725586, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0371, + "step": 18040 + }, + { + "epoch": 0.5160829163688349, + "grad_norm": 0.43021920323371887, + "learning_rate": 8.54624657467318e-06, + "loss": 0.0419, + "step": 18050 + }, + { + "epoch": 0.5163688348820586, + "grad_norm": 0.4683758318424225, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0307, + "step": 18060 + }, + { + "epoch": 0.5166547533952823, + "grad_norm": 0.29085367918014526, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0372, + "step": 18070 + }, + { + "epoch": 0.5169406719085061, + "grad_norm": 0.4396727681159973, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0328, + "step": 18080 + }, + { + "epoch": 0.5172265904217298, + "grad_norm": 0.539021372795105, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0317, + "step": 18090 + }, + { + "epoch": 0.5175125089349535, + "grad_norm": 0.556974470615387, + "learning_rate": 8.499380733111628e-06, + "loss": 0.037, + "step": 18100 + }, + { + "epoch": 0.5177984274481773, + "grad_norm": 0.4445747137069702, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0344, + "step": 18110 + }, + { + "epoch": 0.518084345961401, + "grad_norm": 0.3742713928222656, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0339, + "step": 18120 + }, + { + "epoch": 0.5183702644746248, + "grad_norm": 0.8467416167259216, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0409, + "step": 18130 + }, + { + "epoch": 0.5186561829878484, + "grad_norm": 0.7731484770774841, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0379, + "step": 18140 + }, + { + "epoch": 0.5189421015010722, + "grad_norm": 0.5664084553718567, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0353, + "step": 18150 + }, + { + "epoch": 0.5192280200142959, + "grad_norm": 0.5623966455459595, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0412, + "step": 18160 + }, + { + "epoch": 0.5195139385275197, + "grad_norm": 0.5074556469917297, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0402, + "step": 18170 + }, + { + "epoch": 0.5197998570407434, + "grad_norm": 0.49439728260040283, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0341, + "step": 18180 + }, + { + "epoch": 0.5200857755539671, + "grad_norm": 0.5982527136802673, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0308, + "step": 18190 + }, + { + "epoch": 0.5203716940671909, + "grad_norm": 0.7891598343849182, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0437, + "step": 18200 + }, + { + "epoch": 0.5206576125804145, + "grad_norm": 0.7565666437149048, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0381, + "step": 18210 + }, + { + "epoch": 0.5209435310936383, + "grad_norm": 0.33346351981163025, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0454, + "step": 18220 + }, + { + "epoch": 0.521229449606862, + "grad_norm": 0.5885659456253052, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0413, + "step": 18230 + }, + { + "epoch": 0.5215153681200858, + "grad_norm": 0.6487091183662415, + "learning_rate": 8.368551060444755e-06, + "loss": 0.035, + "step": 18240 + }, + { + "epoch": 0.5218012866333095, + "grad_norm": 0.9817430377006531, + "learning_rate": 8.359228888944986e-06, + "loss": 0.0394, + "step": 18250 + }, + { + "epoch": 0.5220872051465333, + "grad_norm": 0.5691193342208862, + "learning_rate": 8.349909816537207e-06, + "loss": 0.041, + "step": 18260 + }, + { + "epoch": 0.522373123659757, + "grad_norm": 0.5326661467552185, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0361, + "step": 18270 + }, + { + "epoch": 0.5226590421729806, + "grad_norm": 0.5536142587661743, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0406, + "step": 18280 + }, + { + "epoch": 0.5229449606862044, + "grad_norm": 0.3482394218444824, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0423, + "step": 18290 + }, + { + "epoch": 0.5232308791994281, + "grad_norm": 0.514914333820343, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0352, + "step": 18300 + }, + { + "epoch": 0.5235167977126519, + "grad_norm": 0.7681404948234558, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0386, + "step": 18310 + }, + { + "epoch": 0.5238027162258756, + "grad_norm": 0.400426983833313, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0333, + "step": 18320 + }, + { + "epoch": 0.5240886347390994, + "grad_norm": 0.4996081590652466, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0381, + "step": 18330 + }, + { + "epoch": 0.5243745532523231, + "grad_norm": 0.5379085540771484, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0348, + "step": 18340 + }, + { + "epoch": 0.5246604717655469, + "grad_norm": 0.4462053179740906, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0307, + "step": 18350 + }, + { + "epoch": 0.5249463902787705, + "grad_norm": 0.7336096167564392, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0345, + "step": 18360 + }, + { + "epoch": 0.5252323087919942, + "grad_norm": 0.6676360368728638, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0346, + "step": 18370 + }, + { + "epoch": 0.525518227305218, + "grad_norm": 0.46608656644821167, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0334, + "step": 18380 + }, + { + "epoch": 0.5258041458184417, + "grad_norm": 0.4906940460205078, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0331, + "step": 18390 + }, + { + "epoch": 0.5260900643316655, + "grad_norm": 0.4200032353401184, + "learning_rate": 8.219774325200873e-06, + "loss": 0.0394, + "step": 18400 + }, + { + "epoch": 0.5263759828448892, + "grad_norm": 0.5663877725601196, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0349, + "step": 18410 + }, + { + "epoch": 0.526661901358113, + "grad_norm": 0.36824384331703186, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0303, + "step": 18420 + }, + { + "epoch": 0.5269478198713367, + "grad_norm": 0.8120076060295105, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0443, + "step": 18430 + }, + { + "epoch": 0.5272337383845604, + "grad_norm": 0.4102472960948944, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0369, + "step": 18440 + }, + { + "epoch": 0.5275196568977841, + "grad_norm": 0.5186526775360107, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0338, + "step": 18450 + }, + { + "epoch": 0.5278055754110078, + "grad_norm": 0.9650108218193054, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0343, + "step": 18460 + }, + { + "epoch": 0.5280914939242316, + "grad_norm": 0.5894375443458557, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0416, + "step": 18470 + }, + { + "epoch": 0.5283774124374553, + "grad_norm": 0.6188816428184509, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0402, + "step": 18480 + }, + { + "epoch": 0.5286633309506791, + "grad_norm": 0.35280847549438477, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0363, + "step": 18490 + }, + { + "epoch": 0.5289492494639028, + "grad_norm": 0.7289313673973083, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0392, + "step": 18500 + }, + { + "epoch": 0.5292351679771266, + "grad_norm": 0.505050778388977, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0329, + "step": 18510 + }, + { + "epoch": 0.5295210864903502, + "grad_norm": 0.7029705047607422, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0344, + "step": 18520 + }, + { + "epoch": 0.529807005003574, + "grad_norm": 0.2958471477031708, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0431, + "step": 18530 + }, + { + "epoch": 0.5300929235167977, + "grad_norm": 0.9649683237075806, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0329, + "step": 18540 + }, + { + "epoch": 0.5303788420300214, + "grad_norm": 0.24733735620975494, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0354, + "step": 18550 + }, + { + "epoch": 0.5306647605432452, + "grad_norm": 0.44838136434555054, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0334, + "step": 18560 + }, + { + "epoch": 0.5309506790564689, + "grad_norm": 0.4505597949028015, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0338, + "step": 18570 + }, + { + "epoch": 0.5312365975696927, + "grad_norm": 0.44188442826271057, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0326, + "step": 18580 + }, + { + "epoch": 0.5315225160829163, + "grad_norm": 0.4539152979850769, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0369, + "step": 18590 + }, + { + "epoch": 0.5318084345961401, + "grad_norm": 0.8311023712158203, + "learning_rate": 8.03498318084394e-06, + "loss": 0.0441, + "step": 18600 + }, + { + "epoch": 0.5320943531093638, + "grad_norm": 0.53764808177948, + "learning_rate": 8.025779439806006e-06, + "loss": 0.037, + "step": 18610 + }, + { + "epoch": 0.5323802716225876, + "grad_norm": 1.2192102670669556, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0369, + "step": 18620 + }, + { + "epoch": 0.5326661901358113, + "grad_norm": 0.5254611968994141, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0371, + "step": 18630 + }, + { + "epoch": 0.532952108649035, + "grad_norm": 0.585709810256958, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0337, + "step": 18640 + }, + { + "epoch": 0.5332380271622588, + "grad_norm": 0.45416259765625, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0425, + "step": 18650 + }, + { + "epoch": 0.5335239456754824, + "grad_norm": 0.3957739472389221, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0354, + "step": 18660 + }, + { + "epoch": 0.5338098641887062, + "grad_norm": 0.6211117506027222, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0347, + "step": 18670 + }, + { + "epoch": 0.5340957827019299, + "grad_norm": 0.49023327231407166, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0308, + "step": 18680 + }, + { + "epoch": 0.5343817012151537, + "grad_norm": 0.5823351144790649, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0351, + "step": 18690 + }, + { + "epoch": 0.5346676197283774, + "grad_norm": 0.6048677563667297, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0382, + "step": 18700 + }, + { + "epoch": 0.5349535382416012, + "grad_norm": 0.5293828845024109, + "learning_rate": 7.933935782312965e-06, + "loss": 0.0329, + "step": 18710 + }, + { + "epoch": 0.5352394567548249, + "grad_norm": 0.5935509204864502, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0388, + "step": 18720 + }, + { + "epoch": 0.5355253752680486, + "grad_norm": 0.8369598388671875, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0421, + "step": 18730 + }, + { + "epoch": 0.5358112937812723, + "grad_norm": 0.6874870657920837, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0385, + "step": 18740 + }, + { + "epoch": 0.536097212294496, + "grad_norm": 0.43511492013931274, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0405, + "step": 18750 + }, + { + "epoch": 0.5363831308077198, + "grad_norm": 0.662755012512207, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0375, + "step": 18760 + }, + { + "epoch": 0.5366690493209435, + "grad_norm": 0.5519852638244629, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0351, + "step": 18770 + }, + { + "epoch": 0.5369549678341673, + "grad_norm": 0.9711637496948242, + "learning_rate": 7.869858673101027e-06, + "loss": 0.038, + "step": 18780 + }, + { + "epoch": 0.537240886347391, + "grad_norm": 0.4944411516189575, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0416, + "step": 18790 + }, + { + "epoch": 0.5375268048606148, + "grad_norm": 0.5257377624511719, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0349, + "step": 18800 + }, + { + "epoch": 0.5378127233738385, + "grad_norm": 0.4833063781261444, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0414, + "step": 18810 + }, + { + "epoch": 0.5380986418870621, + "grad_norm": 0.4496164917945862, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0369, + "step": 18820 + }, + { + "epoch": 0.5383845604002859, + "grad_norm": 0.6939138174057007, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0337, + "step": 18830 + }, + { + "epoch": 0.5386704789135096, + "grad_norm": 0.32579538226127625, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0371, + "step": 18840 + }, + { + "epoch": 0.5389563974267334, + "grad_norm": 0.35594654083251953, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0366, + "step": 18850 + }, + { + "epoch": 0.5392423159399571, + "grad_norm": 0.6114012002944946, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0372, + "step": 18860 + }, + { + "epoch": 0.5395282344531809, + "grad_norm": 0.8492457270622253, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0346, + "step": 18870 + }, + { + "epoch": 0.5398141529664046, + "grad_norm": 0.5214036703109741, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0388, + "step": 18880 + }, + { + "epoch": 0.5401000714796284, + "grad_norm": 0.428671658039093, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0394, + "step": 18890 + }, + { + "epoch": 0.540385989992852, + "grad_norm": 0.6071562767028809, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0371, + "step": 18900 + }, + { + "epoch": 0.5406719085060757, + "grad_norm": 0.41996505856513977, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0334, + "step": 18910 + }, + { + "epoch": 0.5409578270192995, + "grad_norm": 0.5260844826698303, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0362, + "step": 18920 + }, + { + "epoch": 0.5412437455325232, + "grad_norm": 0.43362122774124146, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0325, + "step": 18930 + }, + { + "epoch": 0.541529664045747, + "grad_norm": 0.4597149193286896, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0345, + "step": 18940 + }, + { + "epoch": 0.5418155825589707, + "grad_norm": 0.6667322516441345, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0421, + "step": 18950 + }, + { + "epoch": 0.5421015010721945, + "grad_norm": 0.8998900651931763, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0368, + "step": 18960 + }, + { + "epoch": 0.5423874195854181, + "grad_norm": 0.5075538158416748, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0327, + "step": 18970 + }, + { + "epoch": 0.5426733380986419, + "grad_norm": 0.38445526361465454, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0446, + "step": 18980 + }, + { + "epoch": 0.5429592566118656, + "grad_norm": 0.696186363697052, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0364, + "step": 18990 + }, + { + "epoch": 0.5432451751250893, + "grad_norm": 0.6371187567710876, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0401, + "step": 19000 + }, + { + "epoch": 0.5435310936383131, + "grad_norm": 0.6122881174087524, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0345, + "step": 19010 + }, + { + "epoch": 0.5438170121515368, + "grad_norm": 0.4222267270088196, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0456, + "step": 19020 + }, + { + "epoch": 0.5441029306647606, + "grad_norm": 0.6122517585754395, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0434, + "step": 19030 + }, + { + "epoch": 0.5443888491779842, + "grad_norm": 0.2783992886543274, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0354, + "step": 19040 + }, + { + "epoch": 0.544674767691208, + "grad_norm": 0.6433000564575195, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0321, + "step": 19050 + }, + { + "epoch": 0.5449606862044317, + "grad_norm": 0.6967030167579651, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0394, + "step": 19060 + }, + { + "epoch": 0.5452466047176555, + "grad_norm": 0.4799044132232666, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0329, + "step": 19070 + }, + { + "epoch": 0.5455325232308792, + "grad_norm": 0.633895993232727, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0316, + "step": 19080 + }, + { + "epoch": 0.5458184417441029, + "grad_norm": 0.5601945519447327, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0449, + "step": 19090 + }, + { + "epoch": 0.5461043602573267, + "grad_norm": 0.4917007088661194, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0351, + "step": 19100 + }, + { + "epoch": 0.5463902787705504, + "grad_norm": 0.4813363254070282, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.029, + "step": 19110 + }, + { + "epoch": 0.5466761972837741, + "grad_norm": 0.5359676480293274, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0346, + "step": 19120 + }, + { + "epoch": 0.5469621157969978, + "grad_norm": 0.6500958204269409, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0374, + "step": 19130 + }, + { + "epoch": 0.5472480343102216, + "grad_norm": 0.7708510756492615, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0332, + "step": 19140 + }, + { + "epoch": 0.5475339528234453, + "grad_norm": 0.45693230628967285, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0344, + "step": 19150 + }, + { + "epoch": 0.5478198713366691, + "grad_norm": 0.6046226620674133, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0342, + "step": 19160 + }, + { + "epoch": 0.5481057898498928, + "grad_norm": 0.5253175497055054, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0449, + "step": 19170 + }, + { + "epoch": 0.5483917083631165, + "grad_norm": 0.3790060877799988, + "learning_rate": 7.507267205473318e-06, + "loss": 0.037, + "step": 19180 + }, + { + "epoch": 0.5486776268763403, + "grad_norm": 0.37709203362464905, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0346, + "step": 19190 + }, + { + "epoch": 0.5489635453895639, + "grad_norm": 0.3940931558609009, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0427, + "step": 19200 + }, + { + "epoch": 0.5492494639027877, + "grad_norm": 0.761299192905426, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0353, + "step": 19210 + }, + { + "epoch": 0.5495353824160114, + "grad_norm": 0.5268495082855225, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0328, + "step": 19220 + }, + { + "epoch": 0.5498213009292352, + "grad_norm": 0.45624151825904846, + "learning_rate": 7.4623904967312e-06, + "loss": 0.0353, + "step": 19230 + }, + { + "epoch": 0.5501072194424589, + "grad_norm": 0.5374972224235535, + "learning_rate": 7.453427567620127e-06, + "loss": 0.0345, + "step": 19240 + }, + { + "epoch": 0.5503931379556827, + "grad_norm": 0.49830907583236694, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0328, + "step": 19250 + }, + { + "epoch": 0.5506790564689064, + "grad_norm": 0.6223296523094177, + "learning_rate": 7.435514206212475e-06, + "loss": 0.037, + "step": 19260 + }, + { + "epoch": 0.55096497498213, + "grad_norm": 0.42801398038864136, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0371, + "step": 19270 + }, + { + "epoch": 0.5512508934953538, + "grad_norm": 0.3872825801372528, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0314, + "step": 19280 + }, + { + "epoch": 0.5515368120085775, + "grad_norm": 0.3967494070529938, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0349, + "step": 19290 + }, + { + "epoch": 0.5518227305218013, + "grad_norm": 0.42383769154548645, + "learning_rate": 7.399737764864619e-06, + "loss": 0.045, + "step": 19300 + }, + { + "epoch": 0.552108649035025, + "grad_norm": 0.48501884937286377, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0373, + "step": 19310 + }, + { + "epoch": 0.5523945675482488, + "grad_norm": 0.3783693015575409, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0334, + "step": 19320 + }, + { + "epoch": 0.5526804860614725, + "grad_norm": 0.5733019709587097, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0369, + "step": 19330 + }, + { + "epoch": 0.5529664045746963, + "grad_norm": 0.5022825002670288, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0375, + "step": 19340 + }, + { + "epoch": 0.5532523230879199, + "grad_norm": 0.5508015155792236, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0415, + "step": 19350 + }, + { + "epoch": 0.5535382416011436, + "grad_norm": 0.5692425966262817, + "learning_rate": 7.346200065486093e-06, + "loss": 0.0401, + "step": 19360 + }, + { + "epoch": 0.5538241601143674, + "grad_norm": 0.7247840762138367, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0373, + "step": 19370 + }, + { + "epoch": 0.5541100786275911, + "grad_norm": 0.633986234664917, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0435, + "step": 19380 + }, + { + "epoch": 0.5543959971408149, + "grad_norm": 0.8598711490631104, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0424, + "step": 19390 + }, + { + "epoch": 0.5546819156540386, + "grad_norm": 0.782328188419342, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0424, + "step": 19400 + }, + { + "epoch": 0.5549678341672624, + "grad_norm": 0.48890456557273865, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0351, + "step": 19410 + }, + { + "epoch": 0.555253752680486, + "grad_norm": 0.4759981036186218, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0395, + "step": 19420 + }, + { + "epoch": 0.5555396711937098, + "grad_norm": 0.6431323885917664, + "learning_rate": 7.283934675167239e-06, + "loss": 0.036, + "step": 19430 + }, + { + "epoch": 0.5558255897069335, + "grad_norm": 0.6633809208869934, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0333, + "step": 19440 + }, + { + "epoch": 0.5561115082201572, + "grad_norm": 0.3405994772911072, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0375, + "step": 19450 + }, + { + "epoch": 0.556397426733381, + "grad_norm": 0.3443987965583801, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0329, + "step": 19460 + }, + { + "epoch": 0.5566833452466047, + "grad_norm": 0.7973398566246033, + "learning_rate": 7.248450164740439e-06, + "loss": 0.0412, + "step": 19470 + }, + { + "epoch": 0.5569692637598285, + "grad_norm": 0.43843239545822144, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0302, + "step": 19480 + }, + { + "epoch": 0.5572551822730522, + "grad_norm": 0.6797782182693481, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0401, + "step": 19490 + }, + { + "epoch": 0.557541100786276, + "grad_norm": 0.5020610690116882, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0329, + "step": 19500 + }, + { + "epoch": 0.5578270192994996, + "grad_norm": 0.5093050003051758, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0381, + "step": 19510 + }, + { + "epoch": 0.5581129378127234, + "grad_norm": 0.6136947870254517, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0373, + "step": 19520 + }, + { + "epoch": 0.5583988563259471, + "grad_norm": 0.4213317930698395, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0338, + "step": 19530 + }, + { + "epoch": 0.5586847748391708, + "grad_norm": 0.6560636162757874, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0332, + "step": 19540 + }, + { + "epoch": 0.5589706933523946, + "grad_norm": 0.41303765773773193, + "learning_rate": 7.177693135871202e-06, + "loss": 0.03, + "step": 19550 + }, + { + "epoch": 0.5592566118656183, + "grad_norm": 0.5260538458824158, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0328, + "step": 19560 + }, + { + "epoch": 0.559542530378842, + "grad_norm": 0.6076327562332153, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0428, + "step": 19570 + }, + { + "epoch": 0.5598284488920657, + "grad_norm": 0.635111927986145, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0318, + "step": 19580 + }, + { + "epoch": 0.5601143674052895, + "grad_norm": 0.7933056354522705, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0357, + "step": 19590 + }, + { + "epoch": 0.5604002859185132, + "grad_norm": 0.44312241673469543, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0289, + "step": 19600 + }, + { + "epoch": 0.560686204431737, + "grad_norm": 0.36346134543418884, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0354, + "step": 19610 + }, + { + "epoch": 0.5609721229449607, + "grad_norm": 0.49605289101600647, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0367, + "step": 19620 + }, + { + "epoch": 0.5612580414581844, + "grad_norm": 0.7115452289581299, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0354, + "step": 19630 + }, + { + "epoch": 0.5615439599714082, + "grad_norm": 0.650925874710083, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0353, + "step": 19640 + }, + { + "epoch": 0.5618298784846318, + "grad_norm": 0.5046663880348206, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0294, + "step": 19650 + }, + { + "epoch": 0.5621157969978556, + "grad_norm": 0.4441855549812317, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0326, + "step": 19660 + }, + { + "epoch": 0.5624017155110793, + "grad_norm": 0.3956650495529175, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0446, + "step": 19670 + }, + { + "epoch": 0.5626876340243031, + "grad_norm": 0.5384211540222168, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0331, + "step": 19680 + }, + { + "epoch": 0.5629735525375268, + "grad_norm": 0.6183366775512695, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0324, + "step": 19690 + }, + { + "epoch": 0.5632594710507506, + "grad_norm": 0.9116242527961731, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0341, + "step": 19700 + }, + { + "epoch": 0.5635453895639743, + "grad_norm": 0.8171015381813049, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0306, + "step": 19710 + }, + { + "epoch": 0.563831308077198, + "grad_norm": 0.42670243978500366, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0336, + "step": 19720 + }, + { + "epoch": 0.5641172265904217, + "grad_norm": 0.7338811159133911, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0363, + "step": 19730 + }, + { + "epoch": 0.5644031451036454, + "grad_norm": 0.5576338171958923, + "learning_rate": 7.010805483338283e-06, + "loss": 0.0371, + "step": 19740 + }, + { + "epoch": 0.5646890636168692, + "grad_norm": 0.7390629649162292, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0382, + "step": 19750 + }, + { + "epoch": 0.5649749821300929, + "grad_norm": 0.801812469959259, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0379, + "step": 19760 + }, + { + "epoch": 0.5652609006433167, + "grad_norm": 0.5697385668754578, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0369, + "step": 19770 + }, + { + "epoch": 0.5655468191565404, + "grad_norm": 0.4180932343006134, + "learning_rate": 6.975884226362e-06, + "loss": 0.039, + "step": 19780 + }, + { + "epoch": 0.5658327376697642, + "grad_norm": 0.648389995098114, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0346, + "step": 19790 + }, + { + "epoch": 0.5661186561829878, + "grad_norm": 0.9673929214477539, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0392, + "step": 19800 + }, + { + "epoch": 0.5664045746962115, + "grad_norm": 0.4793975353240967, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0321, + "step": 19810 + }, + { + "epoch": 0.5666904932094353, + "grad_norm": 0.5206098556518555, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0319, + "step": 19820 + }, + { + "epoch": 0.566976411722659, + "grad_norm": 0.39929306507110596, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0335, + "step": 19830 + }, + { + "epoch": 0.5672623302358828, + "grad_norm": 0.6819440722465515, + "learning_rate": 6.923644220932124e-06, + "loss": 0.0338, + "step": 19840 + }, + { + "epoch": 0.5675482487491065, + "grad_norm": 0.7612042427062988, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0345, + "step": 19850 + }, + { + "epoch": 0.5678341672623303, + "grad_norm": 0.472676545381546, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0388, + "step": 19860 + }, + { + "epoch": 0.568120085775554, + "grad_norm": 0.48102107644081116, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0304, + "step": 19870 + }, + { + "epoch": 0.5684060042887777, + "grad_norm": 0.4174644649028778, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0315, + "step": 19880 + }, + { + "epoch": 0.5686919228020014, + "grad_norm": 0.4218151271343231, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0413, + "step": 19890 + }, + { + "epoch": 0.5689778413152251, + "grad_norm": 0.8243978023529053, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0399, + "step": 19900 + }, + { + "epoch": 0.5692637598284489, + "grad_norm": 0.400924414396286, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0298, + "step": 19910 + }, + { + "epoch": 0.5695496783416726, + "grad_norm": 0.5199277400970459, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0351, + "step": 19920 + }, + { + "epoch": 0.5698355968548964, + "grad_norm": 0.5238781571388245, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0374, + "step": 19930 + }, + { + "epoch": 0.5701215153681201, + "grad_norm": 0.7451756596565247, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0378, + "step": 19940 + }, + { + "epoch": 0.5704074338813439, + "grad_norm": 0.5029926300048828, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0391, + "step": 19950 + }, + { + "epoch": 0.5706933523945675, + "grad_norm": 0.5532147884368896, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0383, + "step": 19960 + }, + { + "epoch": 0.5709792709077913, + "grad_norm": 0.5694131851196289, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0364, + "step": 19970 + }, + { + "epoch": 0.571265189421015, + "grad_norm": 0.5066515803337097, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0363, + "step": 19980 + }, + { + "epoch": 0.5715511079342387, + "grad_norm": 0.5676470398902893, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0327, + "step": 19990 + }, + { + "epoch": 0.5718370264474625, + "grad_norm": 0.37414318323135376, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0395, + "step": 20000 + }, + { + "epoch": 0.5721229449606862, + "grad_norm": 0.5888793468475342, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0372, + "step": 20010 + }, + { + "epoch": 0.57240886347391, + "grad_norm": 0.6593262553215027, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0329, + "step": 20020 + }, + { + "epoch": 0.5726947819871336, + "grad_norm": 0.6382879614830017, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0286, + "step": 20030 + }, + { + "epoch": 0.5729807005003574, + "grad_norm": 0.6364927887916565, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0383, + "step": 20040 + }, + { + "epoch": 0.5732666190135811, + "grad_norm": 0.4102194011211395, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0342, + "step": 20050 + }, + { + "epoch": 0.5735525375268049, + "grad_norm": 0.6449235081672668, + "learning_rate": 6.733587654719298e-06, + "loss": 0.0315, + "step": 20060 + }, + { + "epoch": 0.5738384560400286, + "grad_norm": 0.708431601524353, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0316, + "step": 20070 + }, + { + "epoch": 0.5741243745532523, + "grad_norm": 0.46444272994995117, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0352, + "step": 20080 + }, + { + "epoch": 0.5744102930664761, + "grad_norm": 0.7026715278625488, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0337, + "step": 20090 + }, + { + "epoch": 0.5746962115796997, + "grad_norm": 0.43397894501686096, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0303, + "step": 20100 + }, + { + "epoch": 0.5749821300929235, + "grad_norm": 0.4937734305858612, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0403, + "step": 20110 + }, + { + "epoch": 0.5752680486061472, + "grad_norm": 0.5981410145759583, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0375, + "step": 20120 + }, + { + "epoch": 0.575553967119371, + "grad_norm": 0.5616198778152466, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0314, + "step": 20130 + }, + { + "epoch": 0.5758398856325947, + "grad_norm": 0.35028502345085144, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0367, + "step": 20140 + }, + { + "epoch": 0.5761258041458185, + "grad_norm": 0.3556109666824341, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0308, + "step": 20150 + }, + { + "epoch": 0.5764117226590422, + "grad_norm": 0.579409658908844, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0344, + "step": 20160 + }, + { + "epoch": 0.5766976411722659, + "grad_norm": 0.4484683573246002, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0312, + "step": 20170 + }, + { + "epoch": 0.5769835596854896, + "grad_norm": 0.3636038899421692, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0337, + "step": 20180 + }, + { + "epoch": 0.5772694781987133, + "grad_norm": 0.6667287349700928, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0343, + "step": 20190 + }, + { + "epoch": 0.5775553967119371, + "grad_norm": 0.26031574606895447, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0303, + "step": 20200 + }, + { + "epoch": 0.5778413152251608, + "grad_norm": 0.6683355569839478, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0316, + "step": 20210 + }, + { + "epoch": 0.5781272337383846, + "grad_norm": 0.4097786843776703, + "learning_rate": 6.596880604028027e-06, + "loss": 0.0346, + "step": 20220 + }, + { + "epoch": 0.5784131522516083, + "grad_norm": 0.45405757427215576, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0345, + "step": 20230 + }, + { + "epoch": 0.5786990707648321, + "grad_norm": 0.28291839361190796, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0323, + "step": 20240 + }, + { + "epoch": 0.5789849892780558, + "grad_norm": 0.5656186938285828, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0299, + "step": 20250 + }, + { + "epoch": 0.5792709077912794, + "grad_norm": 0.6780310869216919, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0309, + "step": 20260 + }, + { + "epoch": 0.5795568263045032, + "grad_norm": 0.3968813121318817, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0347, + "step": 20270 + }, + { + "epoch": 0.5798427448177269, + "grad_norm": 0.6598440408706665, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0329, + "step": 20280 + }, + { + "epoch": 0.5801286633309507, + "grad_norm": 0.4988970458507538, + "learning_rate": 6.53748481975927e-06, + "loss": 0.038, + "step": 20290 + }, + { + "epoch": 0.5804145818441744, + "grad_norm": 0.8016706705093384, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0358, + "step": 20300 + }, + { + "epoch": 0.5807005003573982, + "grad_norm": 0.8367684483528137, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0354, + "step": 20310 + }, + { + "epoch": 0.5809864188706219, + "grad_norm": 0.5730129480361938, + "learning_rate": 6.512107839793337e-06, + "loss": 0.0421, + "step": 20320 + }, + { + "epoch": 0.5812723373838456, + "grad_norm": 0.43631577491760254, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0361, + "step": 20330 + }, + { + "epoch": 0.5815582558970693, + "grad_norm": 0.7001264691352844, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0355, + "step": 20340 + }, + { + "epoch": 0.581844174410293, + "grad_norm": 0.4988951086997986, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0387, + "step": 20350 + }, + { + "epoch": 0.5821300929235168, + "grad_norm": 0.45731016993522644, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0398, + "step": 20360 + }, + { + "epoch": 0.5824160114367405, + "grad_norm": 0.38684406876564026, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0345, + "step": 20370 + }, + { + "epoch": 0.5827019299499643, + "grad_norm": 0.3924580514431, + "learning_rate": 6.461496350649529e-06, + "loss": 0.037, + "step": 20380 + }, + { + "epoch": 0.582987848463188, + "grad_norm": 0.43735265731811523, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0371, + "step": 20390 + }, + { + "epoch": 0.5832737669764118, + "grad_norm": 0.4595138430595398, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0337, + "step": 20400 + }, + { + "epoch": 0.5835596854896354, + "grad_norm": 0.429569810628891, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0284, + "step": 20410 + }, + { + "epoch": 0.5838456040028592, + "grad_norm": 0.5399166345596313, + "learning_rate": 6.427861749601945e-06, + "loss": 0.0353, + "step": 20420 + }, + { + "epoch": 0.5841315225160829, + "grad_norm": 0.5698734521865845, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0361, + "step": 20430 + }, + { + "epoch": 0.5844174410293066, + "grad_norm": 0.35422587394714355, + "learning_rate": 6.411076603575166e-06, + "loss": 0.033, + "step": 20440 + }, + { + "epoch": 0.5847033595425304, + "grad_norm": 0.4475875198841095, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0344, + "step": 20450 + }, + { + "epoch": 0.5849892780557541, + "grad_norm": 0.4950159192085266, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0428, + "step": 20460 + }, + { + "epoch": 0.5852751965689779, + "grad_norm": 0.695249617099762, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0354, + "step": 20470 + }, + { + "epoch": 0.5855611150822015, + "grad_norm": 0.2538593113422394, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.0383, + "step": 20480 + }, + { + "epoch": 0.5858470335954253, + "grad_norm": 0.6770910024642944, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0364, + "step": 20490 + }, + { + "epoch": 0.586132952108649, + "grad_norm": 0.7187057733535767, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0319, + "step": 20500 + }, + { + "epoch": 0.5864188706218728, + "grad_norm": 0.34853193163871765, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.033, + "step": 20510 + }, + { + "epoch": 0.5867047891350965, + "grad_norm": 0.8484768271446228, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0331, + "step": 20520 + }, + { + "epoch": 0.5869907076483202, + "grad_norm": 0.6645244359970093, + "learning_rate": 6.335811156758245e-06, + "loss": 0.0353, + "step": 20530 + }, + { + "epoch": 0.587276626161544, + "grad_norm": 0.5094996690750122, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0374, + "step": 20540 + }, + { + "epoch": 0.5875625446747677, + "grad_norm": 0.5012859106063843, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0329, + "step": 20550 + }, + { + "epoch": 0.5878484631879914, + "grad_norm": 0.6465861797332764, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0282, + "step": 20560 + }, + { + "epoch": 0.5881343817012151, + "grad_norm": 0.5694834589958191, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0313, + "step": 20570 + }, + { + "epoch": 0.5884203002144389, + "grad_norm": 0.4945555627346039, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0353, + "step": 20580 + }, + { + "epoch": 0.5887062187276626, + "grad_norm": 0.5606586933135986, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0343, + "step": 20590 + }, + { + "epoch": 0.5889921372408864, + "grad_norm": 0.6913802027702332, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0358, + "step": 20600 + }, + { + "epoch": 0.5892780557541101, + "grad_norm": 0.8119901418685913, + "learning_rate": 6.269280523549298e-06, + "loss": 0.038, + "step": 20610 + }, + { + "epoch": 0.5895639742673338, + "grad_norm": 0.5558752417564392, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0311, + "step": 20620 + }, + { + "epoch": 0.5898498927805575, + "grad_norm": 0.45028987526893616, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0321, + "step": 20630 + }, + { + "epoch": 0.5901358112937812, + "grad_norm": 0.3697125017642975, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0331, + "step": 20640 + }, + { + "epoch": 0.590421729807005, + "grad_norm": 0.5406038761138916, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0445, + "step": 20650 + }, + { + "epoch": 0.5907076483202287, + "grad_norm": 0.4301048219203949, + "learning_rate": 6.227878992893104e-06, + "loss": 0.0371, + "step": 20660 + }, + { + "epoch": 0.5909935668334525, + "grad_norm": 0.6343403458595276, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0353, + "step": 20670 + }, + { + "epoch": 0.5912794853466762, + "grad_norm": 0.4666310250759125, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0352, + "step": 20680 + }, + { + "epoch": 0.5915654038599, + "grad_norm": 0.7471063733100891, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0352, + "step": 20690 + }, + { + "epoch": 0.5918513223731237, + "grad_norm": 0.9971692562103271, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0327, + "step": 20700 + }, + { + "epoch": 0.5921372408863473, + "grad_norm": 0.5646237134933472, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0365, + "step": 20710 + }, + { + "epoch": 0.5924231593995711, + "grad_norm": 0.46781328320503235, + "learning_rate": 6.17838207381795e-06, + "loss": 0.042, + "step": 20720 + }, + { + "epoch": 0.5927090779127948, + "grad_norm": 0.7061547040939331, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0484, + "step": 20730 + }, + { + "epoch": 0.5929949964260186, + "grad_norm": 0.6651175618171692, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0353, + "step": 20740 + }, + { + "epoch": 0.5932809149392423, + "grad_norm": 0.5959596037864685, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0344, + "step": 20750 + }, + { + "epoch": 0.5935668334524661, + "grad_norm": 0.5869056582450867, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0389, + "step": 20760 + }, + { + "epoch": 0.5938527519656898, + "grad_norm": 0.42101356387138367, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0288, + "step": 20770 + }, + { + "epoch": 0.5941386704789136, + "grad_norm": 0.6310023069381714, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0362, + "step": 20780 + }, + { + "epoch": 0.5944245889921372, + "grad_norm": 0.6737013459205627, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0377, + "step": 20790 + }, + { + "epoch": 0.5947105075053609, + "grad_norm": 0.6716046333312988, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0415, + "step": 20800 + }, + { + "epoch": 0.5949964260185847, + "grad_norm": 0.9742669463157654, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0337, + "step": 20810 + }, + { + "epoch": 0.5952823445318084, + "grad_norm": 0.571782648563385, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0362, + "step": 20820 + }, + { + "epoch": 0.5955682630450322, + "grad_norm": 0.9673911333084106, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0362, + "step": 20830 + }, + { + "epoch": 0.5958541815582559, + "grad_norm": 0.5391695499420166, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0331, + "step": 20840 + }, + { + "epoch": 0.5961401000714797, + "grad_norm": 1.4766349792480469, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0332, + "step": 20850 + }, + { + "epoch": 0.5964260185847033, + "grad_norm": 0.6329004168510437, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0375, + "step": 20860 + }, + { + "epoch": 0.5967119370979271, + "grad_norm": 0.6745501160621643, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0347, + "step": 20870 + }, + { + "epoch": 0.5969978556111508, + "grad_norm": 0.3006536364555359, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0321, + "step": 20880 + }, + { + "epoch": 0.5972837741243745, + "grad_norm": 0.4666125476360321, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0363, + "step": 20890 + }, + { + "epoch": 0.5975696926375983, + "grad_norm": 0.3881456255912781, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0318, + "step": 20900 + }, + { + "epoch": 0.597855611150822, + "grad_norm": 0.4211449921131134, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0357, + "step": 20910 + }, + { + "epoch": 0.5981415296640458, + "grad_norm": 1.125683307647705, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0364, + "step": 20920 + }, + { + "epoch": 0.5984274481772694, + "grad_norm": 0.9670853614807129, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0385, + "step": 20930 + }, + { + "epoch": 0.5987133666904932, + "grad_norm": 0.7302138209342957, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0321, + "step": 20940 + }, + { + "epoch": 0.5989992852037169, + "grad_norm": 0.7883613109588623, + "learning_rate": 5.990549152010853e-06, + "loss": 0.038, + "step": 20950 + }, + { + "epoch": 0.5992852037169407, + "grad_norm": 0.44051188230514526, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0423, + "step": 20960 + }, + { + "epoch": 0.5995711222301644, + "grad_norm": 0.5225116014480591, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0293, + "step": 20970 + }, + { + "epoch": 0.5998570407433881, + "grad_norm": 0.44672495126724243, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0314, + "step": 20980 + }, + { + "epoch": 0.6001429592566119, + "grad_norm": 0.4489240050315857, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0298, + "step": 20990 + }, + { + "epoch": 0.6004288777698356, + "grad_norm": 0.3942757844924927, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0323, + "step": 21000 + }, + { + "epoch": 0.6007147962830593, + "grad_norm": 0.5079668760299683, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0435, + "step": 21010 + }, + { + "epoch": 0.601000714796283, + "grad_norm": 0.5057359933853149, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0364, + "step": 21020 + }, + { + "epoch": 0.6012866333095068, + "grad_norm": 0.4823545515537262, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0408, + "step": 21030 + }, + { + "epoch": 0.6015725518227305, + "grad_norm": 0.42647498846054077, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0366, + "step": 21040 + }, + { + "epoch": 0.6018584703359543, + "grad_norm": 0.5967830419540405, + "learning_rate": 5.909845843697164e-06, + "loss": 0.037, + "step": 21050 + }, + { + "epoch": 0.602144388849178, + "grad_norm": 0.4567292034626007, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0306, + "step": 21060 + }, + { + "epoch": 0.6024303073624017, + "grad_norm": 0.6767273545265198, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0447, + "step": 21070 + }, + { + "epoch": 0.6027162258756255, + "grad_norm": 0.2957002520561218, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0339, + "step": 21080 + }, + { + "epoch": 0.6030021443888491, + "grad_norm": 0.6870969533920288, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0313, + "step": 21090 + }, + { + "epoch": 0.6032880629020729, + "grad_norm": 0.530910313129425, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0377, + "step": 21100 + }, + { + "epoch": 0.6035739814152966, + "grad_norm": 0.21370625495910645, + "learning_rate": 5.86170998451151e-06, + "loss": 0.032, + "step": 21110 + }, + { + "epoch": 0.6038598999285204, + "grad_norm": 0.6039503812789917, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0258, + "step": 21120 + }, + { + "epoch": 0.6041458184417441, + "grad_norm": 0.5375682711601257, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0341, + "step": 21130 + }, + { + "epoch": 0.6044317369549679, + "grad_norm": 0.4819096326828003, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0309, + "step": 21140 + }, + { + "epoch": 0.6047176554681916, + "grad_norm": 0.31165415048599243, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0278, + "step": 21150 + }, + { + "epoch": 0.6050035739814152, + "grad_norm": 0.2781001925468445, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0342, + "step": 21160 + }, + { + "epoch": 0.605289492494639, + "grad_norm": 0.44726037979125977, + "learning_rate": 5.813791207086085e-06, + "loss": 0.032, + "step": 21170 + }, + { + "epoch": 0.6055754110078627, + "grad_norm": 0.5762766599655151, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0325, + "step": 21180 + }, + { + "epoch": 0.6058613295210865, + "grad_norm": 0.49829939007759094, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0322, + "step": 21190 + }, + { + "epoch": 0.6061472480343102, + "grad_norm": 0.4683297276496887, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0369, + "step": 21200 + }, + { + "epoch": 0.606433166547534, + "grad_norm": 0.662159264087677, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0278, + "step": 21210 + }, + { + "epoch": 0.6067190850607577, + "grad_norm": 0.4397001564502716, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0366, + "step": 21220 + }, + { + "epoch": 0.6070050035739815, + "grad_norm": 0.4977007508277893, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0293, + "step": 21230 + }, + { + "epoch": 0.6072909220872051, + "grad_norm": 0.3705490827560425, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0315, + "step": 21240 + }, + { + "epoch": 0.6075768406004288, + "grad_norm": 0.6350240111351013, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0286, + "step": 21250 + }, + { + "epoch": 0.6078627591136526, + "grad_norm": 0.5590423941612244, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0373, + "step": 21260 + }, + { + "epoch": 0.6081486776268763, + "grad_norm": 0.5244049429893494, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0325, + "step": 21270 + }, + { + "epoch": 0.6084345961401001, + "grad_norm": 1.082044005393982, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0373, + "step": 21280 + }, + { + "epoch": 0.6087205146533238, + "grad_norm": 0.614028811454773, + "learning_rate": 5.71861298612245e-06, + "loss": 0.031, + "step": 21290 + }, + { + "epoch": 0.6090064331665476, + "grad_norm": 0.783205509185791, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0289, + "step": 21300 + }, + { + "epoch": 0.6092923516797712, + "grad_norm": 0.5420807600021362, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.031, + "step": 21310 + }, + { + "epoch": 0.609578270192995, + "grad_norm": 0.42979222536087036, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0291, + "step": 21320 + }, + { + "epoch": 0.6098641887062187, + "grad_norm": 0.44511356949806213, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.031, + "step": 21330 + }, + { + "epoch": 0.6101501072194424, + "grad_norm": 0.528799831867218, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0269, + "step": 21340 + }, + { + "epoch": 0.6104360257326662, + "grad_norm": 0.43274471163749695, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0438, + "step": 21350 + }, + { + "epoch": 0.6107219442458899, + "grad_norm": 0.8020172715187073, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0393, + "step": 21360 + }, + { + "epoch": 0.6110078627591137, + "grad_norm": 0.4354296028614044, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0338, + "step": 21370 + }, + { + "epoch": 0.6112937812723374, + "grad_norm": 0.587364673614502, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0359, + "step": 21380 + }, + { + "epoch": 0.6115796997855611, + "grad_norm": 0.5426310300827026, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0333, + "step": 21390 + }, + { + "epoch": 0.6118656182987848, + "grad_norm": 0.5900459289550781, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0344, + "step": 21400 + }, + { + "epoch": 0.6121515368120086, + "grad_norm": 0.5652357935905457, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0396, + "step": 21410 + }, + { + "epoch": 0.6124374553252323, + "grad_norm": 0.5287114977836609, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0387, + "step": 21420 + }, + { + "epoch": 0.612723373838456, + "grad_norm": 0.7939184904098511, + "learning_rate": 5.608700869895367e-06, + "loss": 0.0351, + "step": 21430 + }, + { + "epoch": 0.6130092923516798, + "grad_norm": 0.6840642094612122, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0349, + "step": 21440 + }, + { + "epoch": 0.6132952108649035, + "grad_norm": 0.3717428147792816, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0336, + "step": 21450 + }, + { + "epoch": 0.6135811293781273, + "grad_norm": 0.5073713064193726, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0326, + "step": 21460 + }, + { + "epoch": 0.6138670478913509, + "grad_norm": 1.1579232215881348, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0388, + "step": 21470 + }, + { + "epoch": 0.6141529664045747, + "grad_norm": 0.4209369122982025, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0307, + "step": 21480 + }, + { + "epoch": 0.6144388849177984, + "grad_norm": 0.38663822412490845, + "learning_rate": 5.561973825289734e-06, + "loss": 0.037, + "step": 21490 + }, + { + "epoch": 0.6147248034310222, + "grad_norm": 0.538270890712738, + "learning_rate": 5.554208267666996e-06, + "loss": 0.0333, + "step": 21500 + }, + { + "epoch": 0.6150107219442459, + "grad_norm": 0.28280535340309143, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0319, + "step": 21510 + }, + { + "epoch": 0.6152966404574696, + "grad_norm": 0.5407803058624268, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0352, + "step": 21520 + }, + { + "epoch": 0.6155825589706934, + "grad_norm": 1.4600974321365356, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0409, + "step": 21530 + }, + { + "epoch": 0.615868477483917, + "grad_norm": 0.659900426864624, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0322, + "step": 21540 + }, + { + "epoch": 0.6161543959971408, + "grad_norm": 0.6401934623718262, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0339, + "step": 21550 + }, + { + "epoch": 0.6164403145103645, + "grad_norm": 0.6409866213798523, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0331, + "step": 21560 + }, + { + "epoch": 0.6167262330235883, + "grad_norm": 0.6627630591392517, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0332, + "step": 21570 + }, + { + "epoch": 0.617012151536812, + "grad_norm": 0.6180721521377563, + "learning_rate": 5.492314644463202e-06, + "loss": 0.0327, + "step": 21580 + }, + { + "epoch": 0.6172980700500358, + "grad_norm": 0.4689866006374359, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0276, + "step": 21590 + }, + { + "epoch": 0.6175839885632595, + "grad_norm": 0.5039265751838684, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0427, + "step": 21600 + }, + { + "epoch": 0.6178699070764831, + "grad_norm": 0.5313833355903625, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0351, + "step": 21610 + }, + { + "epoch": 0.6181558255897069, + "grad_norm": 0.4919044077396393, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0327, + "step": 21620 + }, + { + "epoch": 0.6184417441029306, + "grad_norm": 0.5446444153785706, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0331, + "step": 21630 + }, + { + "epoch": 0.6187276626161544, + "grad_norm": 0.5198109745979309, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.032, + "step": 21640 + }, + { + "epoch": 0.6190135811293781, + "grad_norm": 0.5684625506401062, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0339, + "step": 21650 + }, + { + "epoch": 0.6192994996426019, + "grad_norm": 0.6882810592651367, + "learning_rate": 5.430834687545416e-06, + "loss": 0.035, + "step": 21660 + }, + { + "epoch": 0.6195854181558256, + "grad_norm": 0.7360101938247681, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0306, + "step": 21670 + }, + { + "epoch": 0.6198713366690494, + "grad_norm": 0.5557180047035217, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0241, + "step": 21680 + }, + { + "epoch": 0.620157255182273, + "grad_norm": 0.4302096962928772, + "learning_rate": 5.407887295494495e-06, + "loss": 0.035, + "step": 21690 + }, + { + "epoch": 0.6204431736954967, + "grad_norm": 0.4740016460418701, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0331, + "step": 21700 + }, + { + "epoch": 0.6207290922087205, + "grad_norm": 0.5400598049163818, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0297, + "step": 21710 + }, + { + "epoch": 0.6210150107219442, + "grad_norm": 0.4270641803741455, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0334, + "step": 21720 + }, + { + "epoch": 0.621300929235168, + "grad_norm": 0.41063550114631653, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0375, + "step": 21730 + }, + { + "epoch": 0.6215868477483917, + "grad_norm": 0.48556044697761536, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0291, + "step": 21740 + }, + { + "epoch": 0.6218727662616155, + "grad_norm": 0.2872731387615204, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0323, + "step": 21750 + }, + { + "epoch": 0.6221586847748392, + "grad_norm": 0.4088454246520996, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0307, + "step": 21760 + }, + { + "epoch": 0.622444603288063, + "grad_norm": 0.42600440979003906, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.0326, + "step": 21770 + }, + { + "epoch": 0.6227305218012866, + "grad_norm": 0.36466315388679504, + "learning_rate": 5.339400468833427e-06, + "loss": 0.0337, + "step": 21780 + }, + { + "epoch": 0.6230164403145103, + "grad_norm": 0.588921308517456, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0336, + "step": 21790 + }, + { + "epoch": 0.6233023588277341, + "grad_norm": 0.44768571853637695, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0326, + "step": 21800 + }, + { + "epoch": 0.6235882773409578, + "grad_norm": 1.1612637042999268, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0355, + "step": 21810 + }, + { + "epoch": 0.6238741958541816, + "grad_norm": 1.0912114381790161, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0436, + "step": 21820 + }, + { + "epoch": 0.6241601143674053, + "grad_norm": 0.5813164710998535, + "learning_rate": 5.301584321328435e-06, + "loss": 0.034, + "step": 21830 + }, + { + "epoch": 0.624446032880629, + "grad_norm": 0.45064911246299744, + "learning_rate": 5.294041118587667e-06, + "loss": 0.032, + "step": 21840 + }, + { + "epoch": 0.6247319513938527, + "grad_norm": 0.5173943638801575, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0322, + "step": 21850 + }, + { + "epoch": 0.6250178699070765, + "grad_norm": 0.41157352924346924, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0319, + "step": 21860 + }, + { + "epoch": 0.6253037884203002, + "grad_norm": 0.5711286067962646, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0288, + "step": 21870 + }, + { + "epoch": 0.6255897069335239, + "grad_norm": 0.5108116865158081, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0325, + "step": 21880 + }, + { + "epoch": 0.6258756254467477, + "grad_norm": 0.49562424421310425, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0306, + "step": 21890 + }, + { + "epoch": 0.6261615439599714, + "grad_norm": 0.3392108976840973, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0358, + "step": 21900 + }, + { + "epoch": 0.6264474624731952, + "grad_norm": 1.0588114261627197, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0404, + "step": 21910 + }, + { + "epoch": 0.6267333809864188, + "grad_norm": 0.6979959607124329, + "learning_rate": 5.233937303988081e-06, + "loss": 0.0349, + "step": 21920 + }, + { + "epoch": 0.6270192994996426, + "grad_norm": 0.3185918927192688, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0276, + "step": 21930 + }, + { + "epoch": 0.6273052180128663, + "grad_norm": 0.3921501338481903, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0326, + "step": 21940 + }, + { + "epoch": 0.6275911365260901, + "grad_norm": 0.9666212797164917, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0346, + "step": 21950 + }, + { + "epoch": 0.6278770550393138, + "grad_norm": 0.4483211040496826, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.0306, + "step": 21960 + }, + { + "epoch": 0.6281629735525375, + "grad_norm": 0.4839077293872833, + "learning_rate": 5.196592054173714e-06, + "loss": 0.026, + "step": 21970 + }, + { + "epoch": 0.6284488920657613, + "grad_norm": 0.5054528117179871, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0299, + "step": 21980 + }, + { + "epoch": 0.628734810578985, + "grad_norm": 0.5953076481819153, + "learning_rate": 5.181701567303612e-06, + "loss": 0.036, + "step": 21990 + }, + { + "epoch": 0.6290207290922087, + "grad_norm": 0.39300060272216797, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0358, + "step": 22000 + }, + { + "epoch": 0.6293066476054324, + "grad_norm": 0.42864665389060974, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0363, + "step": 22010 + }, + { + "epoch": 0.6295925661186562, + "grad_norm": 0.33609238266944885, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0398, + "step": 22020 + }, + { + "epoch": 0.6298784846318799, + "grad_norm": 0.4237107038497925, + "learning_rate": 5.152002600477859e-06, + "loss": 0.0319, + "step": 22030 + }, + { + "epoch": 0.6301644031451037, + "grad_norm": 0.42774054408073425, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0332, + "step": 22040 + }, + { + "epoch": 0.6304503216583274, + "grad_norm": 0.8992825150489807, + "learning_rate": 5.137194259935739e-06, + "loss": 0.0396, + "step": 22050 + }, + { + "epoch": 0.630736240171551, + "grad_norm": 0.20832861959934235, + "learning_rate": 5.129800405815733e-06, + "loss": 0.03, + "step": 22060 + }, + { + "epoch": 0.6310221586847748, + "grad_norm": 0.5961321592330933, + "learning_rate": 5.122413440701921e-06, + "loss": 0.0429, + "step": 22070 + }, + { + "epoch": 0.6313080771979985, + "grad_norm": 0.5037736296653748, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0332, + "step": 22080 + }, + { + "epoch": 0.6315939957112223, + "grad_norm": 0.383732408285141, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0293, + "step": 22090 + }, + { + "epoch": 0.631879914224446, + "grad_norm": 0.8124368786811829, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0286, + "step": 22100 + }, + { + "epoch": 0.6321658327376698, + "grad_norm": 0.96833735704422, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0346, + "step": 22110 + }, + { + "epoch": 0.6324517512508935, + "grad_norm": 0.42382001876831055, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0345, + "step": 22120 + }, + { + "epoch": 0.6327376697641173, + "grad_norm": 0.5928776860237122, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0314, + "step": 22130 + }, + { + "epoch": 0.633023588277341, + "grad_norm": 0.7822670340538025, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0335, + "step": 22140 + }, + { + "epoch": 0.6333095067905646, + "grad_norm": 0.6383520364761353, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0321, + "step": 22150 + }, + { + "epoch": 0.6335954253037884, + "grad_norm": 0.3413240611553192, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0314, + "step": 22160 + }, + { + "epoch": 0.6338813438170121, + "grad_norm": 0.5960783958435059, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0385, + "step": 22170 + }, + { + "epoch": 0.6341672623302359, + "grad_norm": 0.2557702660560608, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0316, + "step": 22180 + }, + { + "epoch": 0.6344531808434596, + "grad_norm": 0.6229982376098633, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0325, + "step": 22190 + }, + { + "epoch": 0.6347390993566834, + "grad_norm": 0.5080077052116394, + "learning_rate": 5.027013727107874e-06, + "loss": 0.036, + "step": 22200 + }, + { + "epoch": 0.6350250178699071, + "grad_norm": 0.5630851984024048, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0279, + "step": 22210 + }, + { + "epoch": 0.6353109363831309, + "grad_norm": 0.81584233045578, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0343, + "step": 22220 + }, + { + "epoch": 0.6355968548963545, + "grad_norm": 0.3985321521759033, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0289, + "step": 22230 + }, + { + "epoch": 0.6358827734095782, + "grad_norm": 0.4481184482574463, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0345, + "step": 22240 + }, + { + "epoch": 0.636168691922802, + "grad_norm": 0.3640075623989105, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0307, + "step": 22250 + }, + { + "epoch": 0.6364546104360257, + "grad_norm": 0.4006771147251129, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0381, + "step": 22260 + }, + { + "epoch": 0.6367405289492495, + "grad_norm": 0.7638134360313416, + "learning_rate": 4.976134120528886e-06, + "loss": 0.039, + "step": 22270 + }, + { + "epoch": 0.6370264474624732, + "grad_norm": 0.4820837080478668, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0281, + "step": 22280 + }, + { + "epoch": 0.637312365975697, + "grad_norm": 0.5928444266319275, + "learning_rate": 4.961660586405147e-06, + "loss": 0.033, + "step": 22290 + }, + { + "epoch": 0.6375982844889206, + "grad_norm": 0.50687575340271, + "learning_rate": 4.954434444590436e-06, + "loss": 0.0357, + "step": 22300 + }, + { + "epoch": 0.6378842030021444, + "grad_norm": 0.673939049243927, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0301, + "step": 22310 + }, + { + "epoch": 0.6381701215153681, + "grad_norm": 0.4300031065940857, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.029, + "step": 22320 + }, + { + "epoch": 0.6384560400285918, + "grad_norm": 0.6585102677345276, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0333, + "step": 22330 + }, + { + "epoch": 0.6387419585418156, + "grad_norm": 0.6430448889732361, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0345, + "step": 22340 + }, + { + "epoch": 0.6390278770550393, + "grad_norm": 0.8272712826728821, + "learning_rate": 4.918410326949594e-06, + "loss": 0.034, + "step": 22350 + }, + { + "epoch": 0.6393137955682631, + "grad_norm": 0.7631726861000061, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0321, + "step": 22360 + }, + { + "epoch": 0.6395997140814867, + "grad_norm": 0.5562252402305603, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0339, + "step": 22370 + }, + { + "epoch": 0.6398856325947105, + "grad_norm": 0.6027814149856567, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0355, + "step": 22380 + }, + { + "epoch": 0.6401715511079342, + "grad_norm": 0.3548984229564667, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0343, + "step": 22390 + }, + { + "epoch": 0.640457469621158, + "grad_norm": 0.4959709346294403, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.031, + "step": 22400 + }, + { + "epoch": 0.6407433881343817, + "grad_norm": 0.3765028715133667, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.0406, + "step": 22410 + }, + { + "epoch": 0.6410293066476054, + "grad_norm": 0.5014662146568298, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0297, + "step": 22420 + }, + { + "epoch": 0.6413152251608292, + "grad_norm": 0.5085675716400146, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0324, + "step": 22430 + }, + { + "epoch": 0.6416011436740529, + "grad_norm": 0.37595826387405396, + "learning_rate": 4.854017257346105e-06, + "loss": 0.033, + "step": 22440 + }, + { + "epoch": 0.6418870621872766, + "grad_norm": 0.5408678650856018, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0323, + "step": 22450 + }, + { + "epoch": 0.6421729807005003, + "grad_norm": 0.4319652020931244, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0269, + "step": 22460 + }, + { + "epoch": 0.6424588992137241, + "grad_norm": 0.41388124227523804, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0311, + "step": 22470 + }, + { + "epoch": 0.6427448177269478, + "grad_norm": 0.4778555631637573, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0332, + "step": 22480 + }, + { + "epoch": 0.6430307362401716, + "grad_norm": 0.38835474848747253, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.0304, + "step": 22490 + }, + { + "epoch": 0.6433166547533953, + "grad_norm": 0.5165611505508423, + "learning_rate": 4.81141273556404e-06, + "loss": 0.0344, + "step": 22500 + }, + { + "epoch": 0.643602573266619, + "grad_norm": 0.4285198450088501, + "learning_rate": 4.804337352679613e-06, + "loss": 0.035, + "step": 22510 + }, + { + "epoch": 0.6438884917798428, + "grad_norm": 0.4512922167778015, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0354, + "step": 22520 + }, + { + "epoch": 0.6441744102930664, + "grad_norm": 0.33437663316726685, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0343, + "step": 22530 + }, + { + "epoch": 0.6444603288062902, + "grad_norm": 0.45291104912757874, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0307, + "step": 22540 + }, + { + "epoch": 0.6447462473195139, + "grad_norm": 0.5920093655586243, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0338, + "step": 22550 + }, + { + "epoch": 0.6450321658327377, + "grad_norm": 0.6362392902374268, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0335, + "step": 22560 + }, + { + "epoch": 0.6453180843459614, + "grad_norm": 0.28033652901649475, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0267, + "step": 22570 + }, + { + "epoch": 0.6456040028591852, + "grad_norm": 0.4563148617744446, + "learning_rate": 4.755013723146175e-06, + "loss": 0.0316, + "step": 22580 + }, + { + "epoch": 0.6458899213724089, + "grad_norm": 0.4889507591724396, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.034, + "step": 22590 + }, + { + "epoch": 0.6461758398856325, + "grad_norm": 0.6826061010360718, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0316, + "step": 22600 + }, + { + "epoch": 0.6464617583988563, + "grad_norm": 0.45066431164741516, + "learning_rate": 4.733984792194363e-06, + "loss": 0.0287, + "step": 22610 + }, + { + "epoch": 0.64674767691208, + "grad_norm": 0.41994187235832214, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0312, + "step": 22620 + }, + { + "epoch": 0.6470335954253038, + "grad_norm": 0.39731675386428833, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0347, + "step": 22630 + }, + { + "epoch": 0.6473195139385275, + "grad_norm": 0.5207498073577881, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0304, + "step": 22640 + }, + { + "epoch": 0.6476054324517513, + "grad_norm": 0.42930668592453003, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0344, + "step": 22650 + }, + { + "epoch": 0.647891350964975, + "grad_norm": 0.3023674488067627, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0359, + "step": 22660 + }, + { + "epoch": 0.6481772694781988, + "grad_norm": 0.43205010890960693, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0323, + "step": 22670 + }, + { + "epoch": 0.6484631879914224, + "grad_norm": 0.5984707474708557, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0318, + "step": 22680 + }, + { + "epoch": 0.6487491065046461, + "grad_norm": 0.43477800488471985, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0346, + "step": 22690 + }, + { + "epoch": 0.6490350250178699, + "grad_norm": 0.3570900857448578, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0342, + "step": 22700 + }, + { + "epoch": 0.6493209435310936, + "grad_norm": 0.47367945313453674, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0367, + "step": 22710 + }, + { + "epoch": 0.6496068620443174, + "grad_norm": 0.3768099844455719, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0357, + "step": 22720 + }, + { + "epoch": 0.6498927805575411, + "grad_norm": 0.6188724040985107, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0299, + "step": 22730 + }, + { + "epoch": 0.6501786990707649, + "grad_norm": 0.5733038783073425, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0343, + "step": 22740 + }, + { + "epoch": 0.6504646175839885, + "grad_norm": 0.5000156164169312, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0284, + "step": 22750 + }, + { + "epoch": 0.6507505360972123, + "grad_norm": 0.22813546657562256, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0288, + "step": 22760 + }, + { + "epoch": 0.651036454610436, + "grad_norm": 0.4805088937282562, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0305, + "step": 22770 + }, + { + "epoch": 0.6513223731236597, + "grad_norm": 0.4652612507343292, + "learning_rate": 4.616077433849538e-06, + "loss": 0.0304, + "step": 22780 + }, + { + "epoch": 0.6516082916368835, + "grad_norm": 0.5010579824447632, + "learning_rate": 4.609208744970524e-06, + "loss": 0.0337, + "step": 22790 + }, + { + "epoch": 0.6518942101501072, + "grad_norm": 0.36260518431663513, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0284, + "step": 22800 + }, + { + "epoch": 0.652180128663331, + "grad_norm": 0.45098820328712463, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0297, + "step": 22810 + }, + { + "epoch": 0.6524660471765547, + "grad_norm": 0.6154504418373108, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0366, + "step": 22820 + }, + { + "epoch": 0.6527519656897784, + "grad_norm": 0.4522152543067932, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.033, + "step": 22830 + }, + { + "epoch": 0.6530378842030021, + "grad_norm": 0.34195253252983093, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0304, + "step": 22840 + }, + { + "epoch": 0.6533238027162259, + "grad_norm": 0.49787941575050354, + "learning_rate": 4.568154392147005e-06, + "loss": 0.033, + "step": 22850 + }, + { + "epoch": 0.6536097212294496, + "grad_norm": 0.5249335765838623, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0381, + "step": 22860 + }, + { + "epoch": 0.6538956397426733, + "grad_norm": 0.7645581960678101, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0298, + "step": 22870 + }, + { + "epoch": 0.6541815582558971, + "grad_norm": 0.6034232974052429, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0313, + "step": 22880 + }, + { + "epoch": 0.6544674767691208, + "grad_norm": 0.3499184846878052, + "learning_rate": 4.54093567906903e-06, + "loss": 0.036, + "step": 22890 + }, + { + "epoch": 0.6547533952823446, + "grad_norm": 0.4157135486602783, + "learning_rate": 4.534149931036931e-06, + "loss": 0.033, + "step": 22900 + }, + { + "epoch": 0.6550393137955682, + "grad_norm": 0.4563712775707245, + "learning_rate": 4.527371771040039e-06, + "loss": 0.0361, + "step": 22910 + }, + { + "epoch": 0.655325232308792, + "grad_norm": 1.080802321434021, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0307, + "step": 22920 + }, + { + "epoch": 0.6556111508220157, + "grad_norm": 0.38259357213974, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0292, + "step": 22930 + }, + { + "epoch": 0.6558970693352395, + "grad_norm": 0.6920587420463562, + "learning_rate": 4.507082898761475e-06, + "loss": 0.0322, + "step": 22940 + }, + { + "epoch": 0.6561829878484632, + "grad_norm": 0.628978967666626, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0391, + "step": 22950 + }, + { + "epoch": 0.6564689063616869, + "grad_norm": 0.4848436713218689, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0306, + "step": 22960 + }, + { + "epoch": 0.6567548248749107, + "grad_norm": 0.4478876292705536, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0334, + "step": 22970 + }, + { + "epoch": 0.6570407433881343, + "grad_norm": 0.47360673546791077, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0357, + "step": 22980 + }, + { + "epoch": 0.6573266619013581, + "grad_norm": 0.32840496301651, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0339, + "step": 22990 + }, + { + "epoch": 0.6576125804145818, + "grad_norm": 0.4047236442565918, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0321, + "step": 23000 + }, + { + "epoch": 0.6578984989278056, + "grad_norm": 0.7817053198814392, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0375, + "step": 23010 + }, + { + "epoch": 0.6581844174410293, + "grad_norm": 0.38985809683799744, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0343, + "step": 23020 + }, + { + "epoch": 0.6584703359542531, + "grad_norm": 0.45360830426216125, + "learning_rate": 4.446628604336844e-06, + "loss": 0.0287, + "step": 23030 + }, + { + "epoch": 0.6587562544674768, + "grad_norm": 0.2886345088481903, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0322, + "step": 23040 + }, + { + "epoch": 0.6590421729807004, + "grad_norm": 0.8546258211135864, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0331, + "step": 23050 + }, + { + "epoch": 0.6593280914939242, + "grad_norm": 0.48426172137260437, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0343, + "step": 23060 + }, + { + "epoch": 0.6596140100071479, + "grad_norm": 0.46379074454307556, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0296, + "step": 23070 + }, + { + "epoch": 0.6598999285203717, + "grad_norm": 0.7772185206413269, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0319, + "step": 23080 + }, + { + "epoch": 0.6601858470335954, + "grad_norm": 0.4606277644634247, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0336, + "step": 23090 + }, + { + "epoch": 0.6604717655468192, + "grad_norm": 0.43342530727386475, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0287, + "step": 23100 + }, + { + "epoch": 0.6607576840600429, + "grad_norm": 0.385151207447052, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0363, + "step": 23110 + }, + { + "epoch": 0.6610436025732667, + "grad_norm": 0.3960207998752594, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0306, + "step": 23120 + }, + { + "epoch": 0.6613295210864903, + "grad_norm": 0.41210439801216125, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0348, + "step": 23130 + }, + { + "epoch": 0.661615439599714, + "grad_norm": 0.41976168751716614, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0272, + "step": 23140 + }, + { + "epoch": 0.6619013581129378, + "grad_norm": 0.3195948004722595, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0362, + "step": 23150 + }, + { + "epoch": 0.6621872766261615, + "grad_norm": 0.7024016380310059, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0316, + "step": 23160 + }, + { + "epoch": 0.6624731951393853, + "grad_norm": 0.2894183099269867, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.0339, + "step": 23170 + }, + { + "epoch": 0.662759113652609, + "grad_norm": 0.489715576171875, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0272, + "step": 23180 + }, + { + "epoch": 0.6630450321658328, + "grad_norm": 0.3406641185283661, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0272, + "step": 23190 + }, + { + "epoch": 0.6633309506790565, + "grad_norm": 0.3647848963737488, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0337, + "step": 23200 + }, + { + "epoch": 0.6636168691922802, + "grad_norm": 0.7023333311080933, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0334, + "step": 23210 + }, + { + "epoch": 0.6639027877055039, + "grad_norm": 0.43989211320877075, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0313, + "step": 23220 + }, + { + "epoch": 0.6641887062187276, + "grad_norm": 0.7329099774360657, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0283, + "step": 23230 + }, + { + "epoch": 0.6644746247319514, + "grad_norm": 0.3954019546508789, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0321, + "step": 23240 + }, + { + "epoch": 0.6647605432451751, + "grad_norm": 0.38020703196525574, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0337, + "step": 23250 + }, + { + "epoch": 0.6650464617583989, + "grad_norm": 0.5988985300064087, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0353, + "step": 23260 + }, + { + "epoch": 0.6653323802716226, + "grad_norm": 0.4259869158267975, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0316, + "step": 23270 + }, + { + "epoch": 0.6656182987848464, + "grad_norm": 0.4322545528411865, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0318, + "step": 23280 + }, + { + "epoch": 0.66590421729807, + "grad_norm": 0.40275540947914124, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0344, + "step": 23290 + }, + { + "epoch": 0.6661901358112938, + "grad_norm": 0.5070827603340149, + "learning_rate": 4.269026084410863e-06, + "loss": 0.0336, + "step": 23300 + }, + { + "epoch": 0.6664760543245175, + "grad_norm": 0.614973247051239, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0352, + "step": 23310 + }, + { + "epoch": 0.6667619728377412, + "grad_norm": 0.4637722074985504, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0277, + "step": 23320 + }, + { + "epoch": 0.667047891350965, + "grad_norm": 0.34951677918434143, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0284, + "step": 23330 + }, + { + "epoch": 0.6673338098641887, + "grad_norm": 0.5609407424926758, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0304, + "step": 23340 + }, + { + "epoch": 0.6676197283774125, + "grad_norm": 0.44585973024368286, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0263, + "step": 23350 + }, + { + "epoch": 0.6679056468906361, + "grad_norm": 0.5311269760131836, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0311, + "step": 23360 + }, + { + "epoch": 0.6681915654038599, + "grad_norm": 0.4923100471496582, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0277, + "step": 23370 + }, + { + "epoch": 0.6684774839170836, + "grad_norm": 0.5254819989204407, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0328, + "step": 23380 + }, + { + "epoch": 0.6687634024303074, + "grad_norm": 0.47537869215011597, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0373, + "step": 23390 + }, + { + "epoch": 0.6690493209435311, + "grad_norm": 0.40087464451789856, + "learning_rate": 4.204700678381975e-06, + "loss": 0.034, + "step": 23400 + }, + { + "epoch": 0.6693352394567548, + "grad_norm": 0.5166190266609192, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0314, + "step": 23410 + }, + { + "epoch": 0.6696211579699786, + "grad_norm": 0.42874693870544434, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0279, + "step": 23420 + }, + { + "epoch": 0.6699070764832022, + "grad_norm": 0.3685651123523712, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0313, + "step": 23430 + }, + { + "epoch": 0.670192994996426, + "grad_norm": 0.5417486429214478, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.033, + "step": 23440 + }, + { + "epoch": 0.6704789135096497, + "grad_norm": 0.5764726996421814, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0368, + "step": 23450 + }, + { + "epoch": 0.6707648320228735, + "grad_norm": 0.44168850779533386, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0258, + "step": 23460 + }, + { + "epoch": 0.6710507505360972, + "grad_norm": 0.39990919828414917, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0403, + "step": 23470 + }, + { + "epoch": 0.671336669049321, + "grad_norm": 0.7526253461837769, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0334, + "step": 23480 + }, + { + "epoch": 0.6716225875625447, + "grad_norm": 0.4888451397418976, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0314, + "step": 23490 + }, + { + "epoch": 0.6719085060757684, + "grad_norm": 0.5732892751693726, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0277, + "step": 23500 + }, + { + "epoch": 0.6721944245889921, + "grad_norm": 0.5806633830070496, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0347, + "step": 23510 + }, + { + "epoch": 0.6724803431022158, + "grad_norm": 0.4336501657962799, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0369, + "step": 23520 + }, + { + "epoch": 0.6727662616154396, + "grad_norm": 0.47082582116127014, + "learning_rate": 4.122270968037107e-06, + "loss": 0.0408, + "step": 23530 + }, + { + "epoch": 0.6730521801286633, + "grad_norm": 0.6571422815322876, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0316, + "step": 23540 + }, + { + "epoch": 0.6733380986418871, + "grad_norm": 0.4899539649486542, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0256, + "step": 23550 + }, + { + "epoch": 0.6736240171551108, + "grad_norm": 0.3201868236064911, + "learning_rate": 4.103441847743051e-06, + "loss": 0.029, + "step": 23560 + }, + { + "epoch": 0.6739099356683346, + "grad_norm": 0.4385588765144348, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0284, + "step": 23570 + }, + { + "epoch": 0.6741958541815583, + "grad_norm": 0.5079174637794495, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0298, + "step": 23580 + }, + { + "epoch": 0.6744817726947819, + "grad_norm": 0.609523355960846, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0313, + "step": 23590 + }, + { + "epoch": 0.6747676912080057, + "grad_norm": 0.487690269947052, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0246, + "step": 23600 + }, + { + "epoch": 0.6750536097212294, + "grad_norm": 0.5146880745887756, + "learning_rate": 4.072221948222934e-06, + "loss": 0.0319, + "step": 23610 + }, + { + "epoch": 0.6753395282344532, + "grad_norm": 0.5848239064216614, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0282, + "step": 23620 + }, + { + "epoch": 0.6756254467476769, + "grad_norm": 0.7779616117477417, + "learning_rate": 4.05979084812184e-06, + "loss": 0.033, + "step": 23630 + }, + { + "epoch": 0.6759113652609007, + "grad_norm": 0.3329331576824188, + "learning_rate": 4.053587511509546e-06, + "loss": 0.028, + "step": 23640 + }, + { + "epoch": 0.6761972837741244, + "grad_norm": 0.4691336154937744, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0313, + "step": 23650 + }, + { + "epoch": 0.6764832022873482, + "grad_norm": 0.47258421778678894, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0326, + "step": 23660 + }, + { + "epoch": 0.6767691208005718, + "grad_norm": 0.5333718657493591, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0332, + "step": 23670 + }, + { + "epoch": 0.6770550393137955, + "grad_norm": 0.7278451323509216, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0409, + "step": 23680 + }, + { + "epoch": 0.6773409578270193, + "grad_norm": 0.41567277908325195, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0263, + "step": 23690 + }, + { + "epoch": 0.677626876340243, + "grad_norm": 0.4351106584072113, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0276, + "step": 23700 + }, + { + "epoch": 0.6779127948534668, + "grad_norm": 0.31096217036247253, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0339, + "step": 23710 + }, + { + "epoch": 0.6781987133666905, + "grad_norm": 0.6321837306022644, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0313, + "step": 23720 + }, + { + "epoch": 0.6784846318799143, + "grad_norm": 0.5278098583221436, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0294, + "step": 23730 + }, + { + "epoch": 0.6787705503931379, + "grad_norm": 0.5778757333755493, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0324, + "step": 23740 + }, + { + "epoch": 0.6790564689063617, + "grad_norm": 0.6164223551750183, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0316, + "step": 23750 + }, + { + "epoch": 0.6793423874195854, + "grad_norm": 0.2872319221496582, + "learning_rate": 3.979785400791052e-06, + "loss": 0.034, + "step": 23760 + }, + { + "epoch": 0.6796283059328091, + "grad_norm": 0.6088704466819763, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0317, + "step": 23770 + }, + { + "epoch": 0.6799142244460329, + "grad_norm": 0.4733040928840637, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0282, + "step": 23780 + }, + { + "epoch": 0.6802001429592566, + "grad_norm": 1.3417131900787354, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0304, + "step": 23790 + }, + { + "epoch": 0.6804860614724804, + "grad_norm": 0.7316146492958069, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0311, + "step": 23800 + }, + { + "epoch": 0.680771979985704, + "grad_norm": 0.5726248025894165, + "learning_rate": 3.949383948670156e-06, + "loss": 0.0323, + "step": 23810 + }, + { + "epoch": 0.6810578984989278, + "grad_norm": 0.3990941345691681, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0277, + "step": 23820 + }, + { + "epoch": 0.6813438170121515, + "grad_norm": 0.49237731099128723, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0287, + "step": 23830 + }, + { + "epoch": 0.6816297355253753, + "grad_norm": 0.47560542821884155, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0298, + "step": 23840 + }, + { + "epoch": 0.681915654038599, + "grad_norm": 0.5967867374420166, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0295, + "step": 23850 + }, + { + "epoch": 0.6822015725518227, + "grad_norm": 0.5726722478866577, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0283, + "step": 23860 + }, + { + "epoch": 0.6824874910650465, + "grad_norm": 0.282678484916687, + "learning_rate": 3.913175335139808e-06, + "loss": 0.0303, + "step": 23870 + }, + { + "epoch": 0.6827734095782702, + "grad_norm": 0.4432118237018585, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0296, + "step": 23880 + }, + { + "epoch": 0.683059328091494, + "grad_norm": 0.33677008748054504, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0379, + "step": 23890 + }, + { + "epoch": 0.6833452466047176, + "grad_norm": 0.5063587427139282, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0281, + "step": 23900 + }, + { + "epoch": 0.6836311651179414, + "grad_norm": 0.2592383921146393, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0263, + "step": 23910 + }, + { + "epoch": 0.6839170836311651, + "grad_norm": 0.4482796788215637, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0289, + "step": 23920 + }, + { + "epoch": 0.6842030021443889, + "grad_norm": 0.2609167993068695, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0294, + "step": 23930 + }, + { + "epoch": 0.6844889206576126, + "grad_norm": 0.36982619762420654, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0306, + "step": 23940 + }, + { + "epoch": 0.6847748391708363, + "grad_norm": 0.47758495807647705, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0273, + "step": 23950 + }, + { + "epoch": 0.68506075768406, + "grad_norm": 0.5566948652267456, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0318, + "step": 23960 + }, + { + "epoch": 0.6853466761972837, + "grad_norm": 0.7815461754798889, + "learning_rate": 3.853493736024934e-06, + "loss": 0.03, + "step": 23970 + }, + { + "epoch": 0.6856325947105075, + "grad_norm": 0.42888402938842773, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0384, + "step": 23980 + }, + { + "epoch": 0.6859185132237312, + "grad_norm": 0.47878748178482056, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0356, + "step": 23990 + }, + { + "epoch": 0.686204431736955, + "grad_norm": 0.3847522735595703, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0272, + "step": 24000 + }, + { + "epoch": 0.6864903502501787, + "grad_norm": 0.7005330920219421, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0314, + "step": 24010 + }, + { + "epoch": 0.6867762687634025, + "grad_norm": 0.7769733667373657, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0306, + "step": 24020 + }, + { + "epoch": 0.6870621872766262, + "grad_norm": 0.4073965847492218, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0286, + "step": 24030 + }, + { + "epoch": 0.6873481057898498, + "grad_norm": 0.6220553517341614, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0358, + "step": 24040 + }, + { + "epoch": 0.6876340243030736, + "grad_norm": 0.32508641481399536, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0284, + "step": 24050 + }, + { + "epoch": 0.6879199428162973, + "grad_norm": 0.4828036427497864, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0308, + "step": 24060 + }, + { + "epoch": 0.6882058613295211, + "grad_norm": 0.4809496998786926, + "learning_rate": 3.794650811106129e-06, + "loss": 0.028, + "step": 24070 + }, + { + "epoch": 0.6884917798427448, + "grad_norm": 0.8497998714447021, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.037, + "step": 24080 + }, + { + "epoch": 0.6887776983559686, + "grad_norm": 0.758666455745697, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0323, + "step": 24090 + }, + { + "epoch": 0.6890636168691923, + "grad_norm": 0.40550050139427185, + "learning_rate": 3.777162510056721e-06, + "loss": 0.0359, + "step": 24100 + }, + { + "epoch": 0.6893495353824161, + "grad_norm": 0.4595869779586792, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0304, + "step": 24110 + }, + { + "epoch": 0.6896354538956397, + "grad_norm": 0.5098794102668762, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0279, + "step": 24120 + }, + { + "epoch": 0.6899213724088634, + "grad_norm": 0.3320889174938202, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0287, + "step": 24130 + }, + { + "epoch": 0.6902072909220872, + "grad_norm": 0.4708438515663147, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.0289, + "step": 24140 + }, + { + "epoch": 0.6904932094353109, + "grad_norm": 1.0990219116210938, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0303, + "step": 24150 + }, + { + "epoch": 0.6907791279485347, + "grad_norm": 0.5109107494354248, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0306, + "step": 24160 + }, + { + "epoch": 0.6910650464617584, + "grad_norm": 0.6247434616088867, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0332, + "step": 24170 + }, + { + "epoch": 0.6913509649749822, + "grad_norm": 0.4033079743385315, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0252, + "step": 24180 + }, + { + "epoch": 0.6916368834882058, + "grad_norm": 0.36993420124053955, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0287, + "step": 24190 + }, + { + "epoch": 0.6919228020014296, + "grad_norm": 0.37320762872695923, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0364, + "step": 24200 + }, + { + "epoch": 0.6922087205146533, + "grad_norm": 0.6411201357841492, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0306, + "step": 24210 + }, + { + "epoch": 0.692494639027877, + "grad_norm": 0.7033433318138123, + "learning_rate": 3.707974016467e-06, + "loss": 0.0334, + "step": 24220 + }, + { + "epoch": 0.6927805575411008, + "grad_norm": 0.5307570695877075, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0338, + "step": 24230 + }, + { + "epoch": 0.6930664760543245, + "grad_norm": 0.6726395487785339, + "learning_rate": 3.696562092850226e-06, + "loss": 0.0379, + "step": 24240 + }, + { + "epoch": 0.6933523945675483, + "grad_norm": 0.5609936714172363, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0272, + "step": 24250 + }, + { + "epoch": 0.693638313080772, + "grad_norm": 0.5961005687713623, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0361, + "step": 24260 + }, + { + "epoch": 0.6939242315939957, + "grad_norm": 0.46744176745414734, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0291, + "step": 24270 + }, + { + "epoch": 0.6942101501072194, + "grad_norm": 0.5180732607841492, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0377, + "step": 24280 + }, + { + "epoch": 0.6944960686204432, + "grad_norm": 0.594201922416687, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0312, + "step": 24290 + }, + { + "epoch": 0.6947819871336669, + "grad_norm": 0.5852509140968323, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0303, + "step": 24300 + }, + { + "epoch": 0.6950679056468906, + "grad_norm": 0.7885274291038513, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0329, + "step": 24310 + }, + { + "epoch": 0.6953538241601144, + "grad_norm": 0.5280163884162903, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.031, + "step": 24320 + }, + { + "epoch": 0.6956397426733381, + "grad_norm": 0.6047127842903137, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0283, + "step": 24330 + }, + { + "epoch": 0.6959256611865619, + "grad_norm": 0.43192219734191895, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0338, + "step": 24340 + }, + { + "epoch": 0.6962115796997855, + "grad_norm": 0.3320246636867523, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0262, + "step": 24350 + }, + { + "epoch": 0.6964974982130093, + "grad_norm": 0.46365252137184143, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0342, + "step": 24360 + }, + { + "epoch": 0.696783416726233, + "grad_norm": 0.537933886051178, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.0286, + "step": 24370 + }, + { + "epoch": 0.6970693352394568, + "grad_norm": 0.3574221134185791, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0342, + "step": 24380 + }, + { + "epoch": 0.6973552537526805, + "grad_norm": 0.7051029205322266, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0316, + "step": 24390 + }, + { + "epoch": 0.6976411722659042, + "grad_norm": 0.587533712387085, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0302, + "step": 24400 + }, + { + "epoch": 0.697927090779128, + "grad_norm": 0.555778980255127, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0301, + "step": 24410 + }, + { + "epoch": 0.6982130092923516, + "grad_norm": 0.44060736894607544, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0296, + "step": 24420 + }, + { + "epoch": 0.6984989278055754, + "grad_norm": 0.3930843472480774, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0327, + "step": 24430 + }, + { + "epoch": 0.6987848463187991, + "grad_norm": 0.8878913521766663, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0389, + "step": 24440 + }, + { + "epoch": 0.6990707648320229, + "grad_norm": 0.45810988545417786, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0322, + "step": 24450 + }, + { + "epoch": 0.6993566833452466, + "grad_norm": 0.41808775067329407, + "learning_rate": 3.573305344104808e-06, + "loss": 0.032, + "step": 24460 + }, + { + "epoch": 0.6996426018584704, + "grad_norm": 0.5060444474220276, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0317, + "step": 24470 + }, + { + "epoch": 0.6999285203716941, + "grad_norm": 0.28741514682769775, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0271, + "step": 24480 + }, + { + "epoch": 0.7002144388849177, + "grad_norm": 0.5564437508583069, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0279, + "step": 24490 + }, + { + "epoch": 0.7005003573981415, + "grad_norm": 0.43762925267219543, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0317, + "step": 24500 + }, + { + "epoch": 0.7007862759113652, + "grad_norm": 0.46590355038642883, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0314, + "step": 24510 + }, + { + "epoch": 0.701072194424589, + "grad_norm": 0.640477180480957, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0258, + "step": 24520 + }, + { + "epoch": 0.7013581129378127, + "grad_norm": 0.5845742225646973, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.0283, + "step": 24530 + }, + { + "epoch": 0.7016440314510365, + "grad_norm": 0.5625128746032715, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0381, + "step": 24540 + }, + { + "epoch": 0.7019299499642602, + "grad_norm": 0.4365232586860657, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0297, + "step": 24550 + }, + { + "epoch": 0.702215868477484, + "grad_norm": 0.5942055583000183, + "learning_rate": 3.518669865884119e-06, + "loss": 0.034, + "step": 24560 + }, + { + "epoch": 0.7025017869907076, + "grad_norm": 0.3847256302833557, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0293, + "step": 24570 + }, + { + "epoch": 0.7027877055039313, + "grad_norm": 0.542539119720459, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0327, + "step": 24580 + }, + { + "epoch": 0.7030736240171551, + "grad_norm": 0.5383610129356384, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0322, + "step": 24590 + }, + { + "epoch": 0.7033595425303788, + "grad_norm": 0.6085273027420044, + "learning_rate": 3.497061149826966e-06, + "loss": 0.0293, + "step": 24600 + }, + { + "epoch": 0.7036454610436026, + "grad_norm": 0.5107666254043579, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0266, + "step": 24610 + }, + { + "epoch": 0.7039313795568263, + "grad_norm": 0.4976873993873596, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0301, + "step": 24620 + }, + { + "epoch": 0.7042172980700501, + "grad_norm": 0.5735257863998413, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.0264, + "step": 24630 + }, + { + "epoch": 0.7045032165832738, + "grad_norm": 0.6035013794898987, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0286, + "step": 24640 + }, + { + "epoch": 0.7047891350964975, + "grad_norm": 0.5665635466575623, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0345, + "step": 24650 + }, + { + "epoch": 0.7050750536097212, + "grad_norm": 0.5783578753471375, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.0385, + "step": 24660 + }, + { + "epoch": 0.7053609721229449, + "grad_norm": 0.3957138657569885, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.0319, + "step": 24670 + }, + { + "epoch": 0.7056468906361687, + "grad_norm": 0.32982495427131653, + "learning_rate": 3.454266765790622e-06, + "loss": 0.034, + "step": 24680 + }, + { + "epoch": 0.7059328091493924, + "grad_norm": 0.5827629566192627, + "learning_rate": 3.448957251110008e-06, + "loss": 0.029, + "step": 24690 + }, + { + "epoch": 0.7062187276626162, + "grad_norm": 0.28891173005104065, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0328, + "step": 24700 + }, + { + "epoch": 0.7065046461758399, + "grad_norm": 0.7992371320724487, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0333, + "step": 24710 + }, + { + "epoch": 0.7067905646890636, + "grad_norm": 0.5976162552833557, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0327, + "step": 24720 + }, + { + "epoch": 0.7070764832022873, + "grad_norm": 0.4785068929195404, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0334, + "step": 24730 + }, + { + "epoch": 0.7073624017155111, + "grad_norm": 0.6561854481697083, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0317, + "step": 24740 + }, + { + "epoch": 0.7076483202287348, + "grad_norm": 0.6745696067810059, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.0289, + "step": 24750 + }, + { + "epoch": 0.7079342387419585, + "grad_norm": 0.4914945960044861, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0306, + "step": 24760 + }, + { + "epoch": 0.7082201572551823, + "grad_norm": 0.35789182782173157, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0327, + "step": 24770 + }, + { + "epoch": 0.708506075768406, + "grad_norm": 0.416161447763443, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0313, + "step": 24780 + }, + { + "epoch": 0.7087919942816298, + "grad_norm": 0.6271718740463257, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0314, + "step": 24790 + }, + { + "epoch": 0.7090779127948534, + "grad_norm": 0.5230259895324707, + "learning_rate": 3.391138816571675e-06, + "loss": 0.037, + "step": 24800 + }, + { + "epoch": 0.7093638313080772, + "grad_norm": 0.54779452085495, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0364, + "step": 24810 + }, + { + "epoch": 0.7096497498213009, + "grad_norm": 0.6326698064804077, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0287, + "step": 24820 + }, + { + "epoch": 0.7099356683345247, + "grad_norm": 0.576437771320343, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0321, + "step": 24830 + }, + { + "epoch": 0.7102215868477484, + "grad_norm": 0.49094530940055847, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0328, + "step": 24840 + }, + { + "epoch": 0.7105075053609721, + "grad_norm": 3.1826400756835938, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0497, + "step": 24850 + }, + { + "epoch": 0.7107934238741959, + "grad_norm": 0.6048339009284973, + "learning_rate": 3.36005636574796e-06, + "loss": 0.0429, + "step": 24860 + }, + { + "epoch": 0.7110793423874195, + "grad_norm": 0.6633393168449402, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0287, + "step": 24870 + }, + { + "epoch": 0.7113652609006433, + "grad_norm": 0.24930168688297272, + "learning_rate": 3.349767211300933e-06, + "loss": 0.027, + "step": 24880 + }, + { + "epoch": 0.711651179413867, + "grad_norm": 0.3934503495693207, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0256, + "step": 24890 + }, + { + "epoch": 0.7119370979270908, + "grad_norm": 0.7811068892478943, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.03, + "step": 24900 + }, + { + "epoch": 0.7122230164403145, + "grad_norm": 0.4274163246154785, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0263, + "step": 24910 + }, + { + "epoch": 0.7125089349535383, + "grad_norm": 0.5188158750534058, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0264, + "step": 24920 + }, + { + "epoch": 0.712794853466762, + "grad_norm": 0.4106016457080841, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0309, + "step": 24930 + }, + { + "epoch": 0.7130807719799857, + "grad_norm": 0.5283434987068176, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0335, + "step": 24940 + }, + { + "epoch": 0.7133666904932094, + "grad_norm": 0.38160789012908936, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0313, + "step": 24950 + }, + { + "epoch": 0.7136526090064331, + "grad_norm": 0.30552029609680176, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0265, + "step": 24960 + }, + { + "epoch": 0.7139385275196569, + "grad_norm": 0.40023618936538696, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0295, + "step": 24970 + }, + { + "epoch": 0.7142244460328806, + "grad_norm": 0.3569220006465912, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0259, + "step": 24980 + }, + { + "epoch": 0.7145103645461044, + "grad_norm": 0.39430442452430725, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0313, + "step": 24990 + }, + { + "epoch": 0.7147962830593281, + "grad_norm": 0.5891808271408081, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0272, + "step": 25000 + }, + { + "epoch": 0.7150822015725519, + "grad_norm": 0.487945556640625, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.0308, + "step": 25010 + }, + { + "epoch": 0.7153681200857755, + "grad_norm": 0.551268458366394, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.026, + "step": 25020 + }, + { + "epoch": 0.7156540385989992, + "grad_norm": 0.7384896278381348, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0371, + "step": 25030 + }, + { + "epoch": 0.715939957112223, + "grad_norm": 0.43013718724250793, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0348, + "step": 25040 + }, + { + "epoch": 0.7162258756254467, + "grad_norm": 0.28747591376304626, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0249, + "step": 25050 + }, + { + "epoch": 0.7165117941386705, + "grad_norm": 0.48107975721359253, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0247, + "step": 25060 + }, + { + "epoch": 0.7167977126518942, + "grad_norm": 0.4077073931694031, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0313, + "step": 25070 + }, + { + "epoch": 0.717083631165118, + "grad_norm": 0.7853788137435913, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0286, + "step": 25080 + }, + { + "epoch": 0.7173695496783417, + "grad_norm": 0.6021899580955505, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0316, + "step": 25090 + }, + { + "epoch": 0.7176554681915654, + "grad_norm": 0.5997788906097412, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0286, + "step": 25100 + }, + { + "epoch": 0.7179413867047891, + "grad_norm": 0.47682714462280273, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0329, + "step": 25110 + }, + { + "epoch": 0.7182273052180128, + "grad_norm": 0.6501848697662354, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0289, + "step": 25120 + }, + { + "epoch": 0.7185132237312366, + "grad_norm": 1.000689148902893, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0351, + "step": 25130 + }, + { + "epoch": 0.7187991422444603, + "grad_norm": 0.5946705937385559, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0268, + "step": 25140 + }, + { + "epoch": 0.7190850607576841, + "grad_norm": 0.46967631578445435, + "learning_rate": 3.214397932123149e-06, + "loss": 0.031, + "step": 25150 + }, + { + "epoch": 0.7193709792709078, + "grad_norm": 1.052093744277954, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0302, + "step": 25160 + }, + { + "epoch": 0.7196568977841316, + "grad_norm": 0.9337649941444397, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0304, + "step": 25170 + }, + { + "epoch": 0.7199428162973552, + "grad_norm": 0.423648864030838, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0297, + "step": 25180 + }, + { + "epoch": 0.720228734810579, + "grad_norm": 0.46862924098968506, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.028, + "step": 25190 + }, + { + "epoch": 0.7205146533238027, + "grad_norm": 0.7099304795265198, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0306, + "step": 25200 + }, + { + "epoch": 0.7208005718370264, + "grad_norm": 0.5219885110855103, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0269, + "step": 25210 + }, + { + "epoch": 0.7210864903502502, + "grad_norm": 0.6347305774688721, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0307, + "step": 25220 + }, + { + "epoch": 0.7213724088634739, + "grad_norm": 0.7043943405151367, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0267, + "step": 25230 + }, + { + "epoch": 0.7216583273766977, + "grad_norm": 0.4137915074825287, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.028, + "step": 25240 + }, + { + "epoch": 0.7219442458899213, + "grad_norm": 0.4374844431877136, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0246, + "step": 25250 + }, + { + "epoch": 0.7222301644031451, + "grad_norm": 0.6796316504478455, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0285, + "step": 25260 + }, + { + "epoch": 0.7225160829163688, + "grad_norm": 0.4662792980670929, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0287, + "step": 25270 + }, + { + "epoch": 0.7228020014295926, + "grad_norm": 0.4035339653491974, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0289, + "step": 25280 + }, + { + "epoch": 0.7230879199428163, + "grad_norm": 0.40217533707618713, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0238, + "step": 25290 + }, + { + "epoch": 0.72337383845604, + "grad_norm": 0.3640667796134949, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0303, + "step": 25300 + }, + { + "epoch": 0.7236597569692638, + "grad_norm": 0.38176655769348145, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.0283, + "step": 25310 + }, + { + "epoch": 0.7239456754824874, + "grad_norm": 0.40747207403182983, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.031, + "step": 25320 + }, + { + "epoch": 0.7242315939957112, + "grad_norm": 0.3859431743621826, + "learning_rate": 3.127844986891409e-06, + "loss": 0.0306, + "step": 25330 + }, + { + "epoch": 0.7245175125089349, + "grad_norm": 0.23738636076450348, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0313, + "step": 25340 + }, + { + "epoch": 0.7248034310221587, + "grad_norm": 0.3772980272769928, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0308, + "step": 25350 + }, + { + "epoch": 0.7250893495353824, + "grad_norm": 0.5451138019561768, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.03, + "step": 25360 + }, + { + "epoch": 0.7253752680486062, + "grad_norm": 0.6431843638420105, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0371, + "step": 25370 + }, + { + "epoch": 0.7256611865618299, + "grad_norm": 0.42552369832992554, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0311, + "step": 25380 + }, + { + "epoch": 0.7259471050750536, + "grad_norm": 0.5802433490753174, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0316, + "step": 25390 + }, + { + "epoch": 0.7262330235882773, + "grad_norm": 0.31489041447639465, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0282, + "step": 25400 + }, + { + "epoch": 0.726518942101501, + "grad_norm": 0.4227478504180908, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0274, + "step": 25410 + }, + { + "epoch": 0.7268048606147248, + "grad_norm": 0.5510851740837097, + "learning_rate": 3.085688933413021e-06, + "loss": 0.0297, + "step": 25420 + }, + { + "epoch": 0.7270907791279485, + "grad_norm": 0.3073323667049408, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0244, + "step": 25430 + }, + { + "epoch": 0.7273766976411723, + "grad_norm": 0.7394781112670898, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.028, + "step": 25440 + }, + { + "epoch": 0.727662616154396, + "grad_norm": 0.5067957639694214, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0374, + "step": 25450 + }, + { + "epoch": 0.7279485346676198, + "grad_norm": 0.4093882739543915, + "learning_rate": 3.067194157156521e-06, + "loss": 0.0347, + "step": 25460 + }, + { + "epoch": 0.7282344531808435, + "grad_norm": 0.37054866552352905, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0305, + "step": 25470 + }, + { + "epoch": 0.7285203716940671, + "grad_norm": 0.38795027136802673, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0282, + "step": 25480 + }, + { + "epoch": 0.7288062902072909, + "grad_norm": 0.49282407760620117, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.0301, + "step": 25490 + }, + { + "epoch": 0.7290922087205146, + "grad_norm": 0.5234564542770386, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0352, + "step": 25500 + }, + { + "epoch": 0.7293781272337384, + "grad_norm": 0.5383297801017761, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0317, + "step": 25510 + }, + { + "epoch": 0.7296640457469621, + "grad_norm": 0.4277333617210388, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0278, + "step": 25520 + }, + { + "epoch": 0.7299499642601859, + "grad_norm": 0.6099430322647095, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0356, + "step": 25530 + }, + { + "epoch": 0.7302358827734096, + "grad_norm": 0.38870710134506226, + "learning_rate": 3.030651808761638e-06, + "loss": 0.027, + "step": 25540 + }, + { + "epoch": 0.7305218012866334, + "grad_norm": 0.48884090781211853, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0251, + "step": 25550 + }, + { + "epoch": 0.730807719799857, + "grad_norm": 0.5136672258377075, + "learning_rate": 3.021609639602321e-06, + "loss": 0.025, + "step": 25560 + }, + { + "epoch": 0.7310936383130807, + "grad_norm": 0.527056872844696, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.03, + "step": 25570 + }, + { + "epoch": 0.7313795568263045, + "grad_norm": 0.7081360220909119, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0303, + "step": 25580 + }, + { + "epoch": 0.7316654753395282, + "grad_norm": 0.48397257924079895, + "learning_rate": 3.008116622200155e-06, + "loss": 0.032, + "step": 25590 + }, + { + "epoch": 0.731951393852752, + "grad_norm": 0.38431495428085327, + "learning_rate": 3.003637700546652e-06, + "loss": 0.0337, + "step": 25600 + }, + { + "epoch": 0.7322373123659757, + "grad_norm": 0.48320460319519043, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0336, + "step": 25610 + }, + { + "epoch": 0.7325232308791995, + "grad_norm": 0.3164500892162323, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0244, + "step": 25620 + }, + { + "epoch": 0.7328091493924231, + "grad_norm": 0.5140587091445923, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0267, + "step": 25630 + }, + { + "epoch": 0.7330950679056469, + "grad_norm": 0.30739104747772217, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0373, + "step": 25640 + }, + { + "epoch": 0.7333809864188706, + "grad_norm": 0.3579956591129303, + "learning_rate": 2.981383959667165e-06, + "loss": 0.0328, + "step": 25650 + }, + { + "epoch": 0.7336669049320943, + "grad_norm": 0.7733256220817566, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0335, + "step": 25660 + }, + { + "epoch": 0.7339528234453181, + "grad_norm": 0.5355008244514465, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0291, + "step": 25670 + }, + { + "epoch": 0.7342387419585418, + "grad_norm": 0.5733621120452881, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0223, + "step": 25680 + }, + { + "epoch": 0.7345246604717656, + "grad_norm": 0.4484233260154724, + "learning_rate": 2.963750320724704e-06, + "loss": 0.03, + "step": 25690 + }, + { + "epoch": 0.7348105789849892, + "grad_norm": 0.46975597739219666, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0325, + "step": 25700 + }, + { + "epoch": 0.735096497498213, + "grad_norm": 0.4674699008464813, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0321, + "step": 25710 + }, + { + "epoch": 0.7353824160114367, + "grad_norm": 0.301565557718277, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.0279, + "step": 25720 + }, + { + "epoch": 0.7356683345246605, + "grad_norm": 0.41966041922569275, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0319, + "step": 25730 + }, + { + "epoch": 0.7359542530378842, + "grad_norm": 0.5388277173042297, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0287, + "step": 25740 + }, + { + "epoch": 0.7362401715511079, + "grad_norm": 0.5821589231491089, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0298, + "step": 25750 + }, + { + "epoch": 0.7365260900643317, + "grad_norm": 0.9340733289718628, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0307, + "step": 25760 + }, + { + "epoch": 0.7368120085775554, + "grad_norm": 0.3654371201992035, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0284, + "step": 25770 + }, + { + "epoch": 0.7370979270907791, + "grad_norm": 0.38794293999671936, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0306, + "step": 25780 + }, + { + "epoch": 0.7373838456040028, + "grad_norm": 0.39955422282218933, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.0324, + "step": 25790 + }, + { + "epoch": 0.7376697641172266, + "grad_norm": 0.5864313244819641, + "learning_rate": 2.916036854664115e-06, + "loss": 0.031, + "step": 25800 + }, + { + "epoch": 0.7379556826304503, + "grad_norm": 0.4324203431606293, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0253, + "step": 25810 + }, + { + "epoch": 0.7382416011436741, + "grad_norm": 0.6346203684806824, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0278, + "step": 25820 + }, + { + "epoch": 0.7385275196568978, + "grad_norm": 0.3984649181365967, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0352, + "step": 25830 + }, + { + "epoch": 0.7388134381701215, + "grad_norm": 0.3954542577266693, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0305, + "step": 25840 + }, + { + "epoch": 0.7390993566833453, + "grad_norm": 0.3119542598724365, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.0372, + "step": 25850 + }, + { + "epoch": 0.7393852751965689, + "grad_norm": 0.4094623029232025, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0251, + "step": 25860 + }, + { + "epoch": 0.7396711937097927, + "grad_norm": 0.5250104665756226, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0302, + "step": 25870 + }, + { + "epoch": 0.7399571122230164, + "grad_norm": 0.7610230445861816, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0257, + "step": 25880 + }, + { + "epoch": 0.7402430307362402, + "grad_norm": 0.5546014904975891, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0249, + "step": 25890 + }, + { + "epoch": 0.7405289492494639, + "grad_norm": 0.22835634648799896, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0287, + "step": 25900 + }, + { + "epoch": 0.7408148677626877, + "grad_norm": 0.7073826789855957, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0409, + "step": 25910 + }, + { + "epoch": 0.7411007862759114, + "grad_norm": 0.604634165763855, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0293, + "step": 25920 + }, + { + "epoch": 0.741386704789135, + "grad_norm": 0.46605581045150757, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0261, + "step": 25930 + }, + { + "epoch": 0.7416726233023588, + "grad_norm": 0.35719090700149536, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0304, + "step": 25940 + }, + { + "epoch": 0.7419585418155825, + "grad_norm": 0.3806651532649994, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0313, + "step": 25950 + }, + { + "epoch": 0.7422444603288063, + "grad_norm": 0.6443240642547607, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0303, + "step": 25960 + }, + { + "epoch": 0.74253037884203, + "grad_norm": 0.42187514901161194, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.0282, + "step": 25970 + }, + { + "epoch": 0.7428162973552538, + "grad_norm": 0.4213440418243408, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0312, + "step": 25980 + }, + { + "epoch": 0.7431022158684775, + "grad_norm": 0.3982003331184387, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0279, + "step": 25990 + }, + { + "epoch": 0.7433881343817013, + "grad_norm": 0.3418596386909485, + "learning_rate": 2.832230653119002e-06, + "loss": 0.0318, + "step": 26000 + }, + { + "epoch": 0.7436740528949249, + "grad_norm": 0.3633996844291687, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0301, + "step": 26010 + }, + { + "epoch": 0.7439599714081486, + "grad_norm": 0.362079918384552, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.028, + "step": 26020 + }, + { + "epoch": 0.7442458899213724, + "grad_norm": 0.4734862744808197, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.031, + "step": 26030 + }, + { + "epoch": 0.7445318084345961, + "grad_norm": 0.31540775299072266, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0287, + "step": 26040 + }, + { + "epoch": 0.7448177269478199, + "grad_norm": 0.6774418950080872, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.029, + "step": 26050 + }, + { + "epoch": 0.7451036454610436, + "grad_norm": 0.3063428997993469, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0308, + "step": 26060 + }, + { + "epoch": 0.7453895639742674, + "grad_norm": 0.691943347454071, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.0265, + "step": 26070 + }, + { + "epoch": 0.745675482487491, + "grad_norm": 0.5507379174232483, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0286, + "step": 26080 + }, + { + "epoch": 0.7459614010007148, + "grad_norm": 0.34355828166007996, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.024, + "step": 26090 + }, + { + "epoch": 0.7462473195139385, + "grad_norm": 0.5120819807052612, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0288, + "step": 26100 + }, + { + "epoch": 0.7465332380271622, + "grad_norm": 0.5197821259498596, + "learning_rate": 2.78776903555923e-06, + "loss": 0.028, + "step": 26110 + }, + { + "epoch": 0.746819156540386, + "grad_norm": 0.46328091621398926, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.0247, + "step": 26120 + }, + { + "epoch": 0.7471050750536097, + "grad_norm": 0.6205909848213196, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0306, + "step": 26130 + }, + { + "epoch": 0.7473909935668335, + "grad_norm": 0.4201740622520447, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.025, + "step": 26140 + }, + { + "epoch": 0.7476769120800572, + "grad_norm": 0.23724111914634705, + "learning_rate": 2.771889969647e-06, + "loss": 0.0283, + "step": 26150 + }, + { + "epoch": 0.747962830593281, + "grad_norm": 0.8046770691871643, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0318, + "step": 26160 + }, + { + "epoch": 0.7482487491065046, + "grad_norm": 0.5273832082748413, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0342, + "step": 26170 + }, + { + "epoch": 0.7485346676197284, + "grad_norm": 0.923651397228241, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0255, + "step": 26180 + }, + { + "epoch": 0.7488205861329521, + "grad_norm": 0.6395840644836426, + "learning_rate": 2.756165401834933e-06, + "loss": 0.0277, + "step": 26190 + }, + { + "epoch": 0.7491065046461758, + "grad_norm": 0.44334620237350464, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.0285, + "step": 26200 + }, + { + "epoch": 0.7493924231593996, + "grad_norm": 0.47904232144355774, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0282, + "step": 26210 + }, + { + "epoch": 0.7496783416726233, + "grad_norm": 0.9316203594207764, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0307, + "step": 26220 + }, + { + "epoch": 0.749964260185847, + "grad_norm": 0.5045170783996582, + "learning_rate": 2.740595627381096e-06, + "loss": 0.0242, + "step": 26230 + }, + { + "epoch": 0.7502501786990707, + "grad_norm": 0.54493248462677, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0263, + "step": 26240 + }, + { + "epoch": 0.7505360972122945, + "grad_norm": 0.6128116846084595, + "learning_rate": 2.732868879137055e-06, + "loss": 0.0305, + "step": 26250 + }, + { + "epoch": 0.7508220157255182, + "grad_norm": 0.6235067844390869, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.037, + "step": 26260 + }, + { + "epoch": 0.751107934238742, + "grad_norm": 0.43458008766174316, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0274, + "step": 26270 + }, + { + "epoch": 0.7513938527519657, + "grad_norm": 0.5540400147438049, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0289, + "step": 26280 + }, + { + "epoch": 0.7516797712651894, + "grad_norm": 0.4317619204521179, + "learning_rate": 2.717531841969889e-06, + "loss": 0.0313, + "step": 26290 + }, + { + "epoch": 0.7519656897784132, + "grad_norm": 0.42271071672439575, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0291, + "step": 26300 + }, + { + "epoch": 0.7522516082916368, + "grad_norm": 0.6096150875091553, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0374, + "step": 26310 + }, + { + "epoch": 0.7525375268048606, + "grad_norm": 0.5820568799972534, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.027, + "step": 26320 + }, + { + "epoch": 0.7528234453180843, + "grad_norm": 0.4441884756088257, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0258, + "step": 26330 + }, + { + "epoch": 0.7531093638313081, + "grad_norm": 0.48442211747169495, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.0257, + "step": 26340 + }, + { + "epoch": 0.7533952823445318, + "grad_norm": 0.7179747223854065, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0281, + "step": 26350 + }, + { + "epoch": 0.7536812008577556, + "grad_norm": 0.5399336218833923, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.03, + "step": 26360 + }, + { + "epoch": 0.7539671193709793, + "grad_norm": 0.5521562099456787, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.0267, + "step": 26370 + }, + { + "epoch": 0.754253037884203, + "grad_norm": 0.3727903366088867, + "learning_rate": 2.683592557860616e-06, + "loss": 0.0274, + "step": 26380 + }, + { + "epoch": 0.7545389563974267, + "grad_norm": 0.5607078671455383, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.033, + "step": 26390 + }, + { + "epoch": 0.7548248749106504, + "grad_norm": 0.3736121654510498, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.0267, + "step": 26400 + }, + { + "epoch": 0.7551107934238742, + "grad_norm": 0.47778844833374023, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0246, + "step": 26410 + }, + { + "epoch": 0.7553967119370979, + "grad_norm": 0.5479125380516052, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.0273, + "step": 26420 + }, + { + "epoch": 0.7556826304503217, + "grad_norm": 0.5152542591094971, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0288, + "step": 26430 + }, + { + "epoch": 0.7559685489635454, + "grad_norm": 0.38652661442756653, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0319, + "step": 26440 + }, + { + "epoch": 0.7562544674767692, + "grad_norm": 0.8551011085510254, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.0312, + "step": 26450 + }, + { + "epoch": 0.7565403859899928, + "grad_norm": 0.5332438349723816, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0348, + "step": 26460 + }, + { + "epoch": 0.7568263045032165, + "grad_norm": 0.5529776215553284, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.0305, + "step": 26470 + }, + { + "epoch": 0.7571122230164403, + "grad_norm": 0.47610723972320557, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.027, + "step": 26480 + }, + { + "epoch": 0.757398141529664, + "grad_norm": 0.5565681457519531, + "learning_rate": 2.643185095075473e-06, + "loss": 0.0277, + "step": 26490 + }, + { + "epoch": 0.7576840600428878, + "grad_norm": 0.40319734811782837, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0287, + "step": 26500 + }, + { + "epoch": 0.7579699785561115, + "grad_norm": 0.5117385387420654, + "learning_rate": 2.635965610168249e-06, + "loss": 0.0312, + "step": 26510 + }, + { + "epoch": 0.7582558970693353, + "grad_norm": 0.47812822461128235, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0287, + "step": 26520 + }, + { + "epoch": 0.758541815582559, + "grad_norm": 0.24216991662979126, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0321, + "step": 26530 + }, + { + "epoch": 0.7588277340957827, + "grad_norm": 0.24864375591278076, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0312, + "step": 26540 + }, + { + "epoch": 0.7591136526090064, + "grad_norm": 0.39162659645080566, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0333, + "step": 26550 + }, + { + "epoch": 0.7593995711222301, + "grad_norm": 0.30692365765571594, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.0261, + "step": 26560 + }, + { + "epoch": 0.7596854896354539, + "grad_norm": 0.5904929041862488, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0309, + "step": 26570 + }, + { + "epoch": 0.7599714081486776, + "grad_norm": 0.5509836673736572, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0223, + "step": 26580 + }, + { + "epoch": 0.7602573266619014, + "grad_norm": 0.45913293957710266, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0338, + "step": 26590 + }, + { + "epoch": 0.7605432451751251, + "grad_norm": 0.3952873647212982, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.0283, + "step": 26600 + }, + { + "epoch": 0.7608291636883489, + "grad_norm": 0.49259039759635925, + "learning_rate": 2.600457796416397e-06, + "loss": 0.0262, + "step": 26610 + }, + { + "epoch": 0.7611150822015725, + "grad_norm": 0.49096909165382385, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.0265, + "step": 26620 + }, + { + "epoch": 0.7614010007147963, + "grad_norm": 0.48913729190826416, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0347, + "step": 26630 + }, + { + "epoch": 0.76168691922802, + "grad_norm": 0.391233891248703, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0281, + "step": 26640 + }, + { + "epoch": 0.7619728377412437, + "grad_norm": 0.3726404011249542, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0252, + "step": 26650 + }, + { + "epoch": 0.7622587562544675, + "grad_norm": 0.441919207572937, + "learning_rate": 2.583073279935805e-06, + "loss": 0.025, + "step": 26660 + }, + { + "epoch": 0.7625446747676912, + "grad_norm": 0.6720325350761414, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.0264, + "step": 26670 + }, + { + "epoch": 0.762830593280915, + "grad_norm": 0.4706156849861145, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0307, + "step": 26680 + }, + { + "epoch": 0.7631165117941386, + "grad_norm": 0.6154748797416687, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0283, + "step": 26690 + }, + { + "epoch": 0.7634024303073624, + "grad_norm": 0.4765104651451111, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.0292, + "step": 26700 + }, + { + "epoch": 0.7636883488205861, + "grad_norm": 0.33775731921195984, + "learning_rate": 2.565935706183804e-06, + "loss": 0.0281, + "step": 26710 + }, + { + "epoch": 0.7639742673338099, + "grad_norm": 0.9325317144393921, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0282, + "step": 26720 + }, + { + "epoch": 0.7642601858470336, + "grad_norm": 0.5118368864059448, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0264, + "step": 26730 + }, + { + "epoch": 0.7645461043602573, + "grad_norm": 0.6633817553520203, + "learning_rate": 2.555771903907403e-06, + "loss": 0.035, + "step": 26740 + }, + { + "epoch": 0.7648320228734811, + "grad_norm": 0.8666901588439941, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0299, + "step": 26750 + }, + { + "epoch": 0.7651179413867047, + "grad_norm": 0.47465914487838745, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0281, + "step": 26760 + }, + { + "epoch": 0.7654038598999285, + "grad_norm": 0.5317928791046143, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0237, + "step": 26770 + }, + { + "epoch": 0.7656897784131522, + "grad_norm": 0.6626484394073486, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.0297, + "step": 26780 + }, + { + "epoch": 0.765975696926376, + "grad_norm": 0.5603852272033691, + "learning_rate": 2.5390304813179e-06, + "loss": 0.0279, + "step": 26790 + }, + { + "epoch": 0.7662616154395997, + "grad_norm": 0.392030268907547, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.0276, + "step": 26800 + }, + { + "epoch": 0.7665475339528235, + "grad_norm": 0.5270085334777832, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0283, + "step": 26810 + }, + { + "epoch": 0.7668334524660472, + "grad_norm": 0.5256703495979309, + "learning_rate": 2.529104749380281e-06, + "loss": 0.029, + "step": 26820 + }, + { + "epoch": 0.7671193709792709, + "grad_norm": 0.3960905075073242, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0297, + "step": 26830 + }, + { + "epoch": 0.7674052894924946, + "grad_norm": 0.4214257597923279, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0279, + "step": 26840 + }, + { + "epoch": 0.7676912080057183, + "grad_norm": 0.4516659677028656, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0268, + "step": 26850 + }, + { + "epoch": 0.7679771265189421, + "grad_norm": 0.4527135193347931, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0285, + "step": 26860 + }, + { + "epoch": 0.7682630450321658, + "grad_norm": 0.4458029270172119, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0282, + "step": 26870 + }, + { + "epoch": 0.7685489635453896, + "grad_norm": 0.5262351036071777, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.0289, + "step": 26880 + }, + { + "epoch": 0.7688348820586133, + "grad_norm": 0.7576776146888733, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0304, + "step": 26890 + }, + { + "epoch": 0.7691208005718371, + "grad_norm": 0.3779038190841675, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0308, + "step": 26900 + }, + { + "epoch": 0.7694067190850608, + "grad_norm": 0.5801526308059692, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0279, + "step": 26910 + }, + { + "epoch": 0.7696926375982844, + "grad_norm": 0.6423588991165161, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0291, + "step": 26920 + }, + { + "epoch": 0.7699785561115082, + "grad_norm": 0.3891446590423584, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0276, + "step": 26930 + }, + { + "epoch": 0.7702644746247319, + "grad_norm": 0.6453003883361816, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0297, + "step": 26940 + }, + { + "epoch": 0.7705503931379557, + "grad_norm": 0.5512704253196716, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0273, + "step": 26950 + }, + { + "epoch": 0.7708363116511794, + "grad_norm": 0.5719016790390015, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0284, + "step": 26960 + }, + { + "epoch": 0.7711222301644032, + "grad_norm": 0.325624942779541, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0249, + "step": 26970 + }, + { + "epoch": 0.7714081486776269, + "grad_norm": 0.5242589712142944, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0268, + "step": 26980 + }, + { + "epoch": 0.7716940671908507, + "grad_norm": 0.3835712969303131, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.0293, + "step": 26990 + }, + { + "epoch": 0.7719799857040743, + "grad_norm": 0.5894249081611633, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0303, + "step": 27000 + }, + { + "epoch": 0.772265904217298, + "grad_norm": 0.4519590437412262, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.0252, + "step": 27010 + }, + { + "epoch": 0.7725518227305218, + "grad_norm": 0.590528130531311, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0297, + "step": 27020 + }, + { + "epoch": 0.7728377412437455, + "grad_norm": 0.5418447852134705, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0306, + "step": 27030 + }, + { + "epoch": 0.7731236597569693, + "grad_norm": 1.027212142944336, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.0302, + "step": 27040 + }, + { + "epoch": 0.773409578270193, + "grad_norm": 0.5057966709136963, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.0295, + "step": 27050 + }, + { + "epoch": 0.7736954967834168, + "grad_norm": 0.9749689698219299, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.0288, + "step": 27060 + }, + { + "epoch": 0.7739814152966404, + "grad_norm": 0.7263986468315125, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.03, + "step": 27070 + }, + { + "epoch": 0.7742673338098642, + "grad_norm": 0.6080947518348694, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0312, + "step": 27080 + }, + { + "epoch": 0.7745532523230879, + "grad_norm": 0.5187621712684631, + "learning_rate": 2.443811559007335e-06, + "loss": 0.0235, + "step": 27090 + }, + { + "epoch": 0.7748391708363116, + "grad_norm": 0.6019864678382874, + "learning_rate": 2.440792688039862e-06, + "loss": 0.0356, + "step": 27100 + }, + { + "epoch": 0.7751250893495354, + "grad_norm": 0.4716169238090515, + "learning_rate": 2.437783861778914e-06, + "loss": 0.0241, + "step": 27110 + }, + { + "epoch": 0.7754110078627591, + "grad_norm": 0.2648717761039734, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.027, + "step": 27120 + }, + { + "epoch": 0.7756969263759829, + "grad_norm": 0.43119028210639954, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0278, + "step": 27130 + }, + { + "epoch": 0.7759828448892065, + "grad_norm": 0.37466534972190857, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0254, + "step": 27140 + }, + { + "epoch": 0.7762687634024303, + "grad_norm": 0.36353442072868347, + "learning_rate": 2.425849074243997e-06, + "loss": 0.0263, + "step": 27150 + }, + { + "epoch": 0.776554681915654, + "grad_norm": 0.35461705923080444, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0281, + "step": 27160 + }, + { + "epoch": 0.7768406004288778, + "grad_norm": 0.5017783045768738, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0312, + "step": 27170 + }, + { + "epoch": 0.7771265189421015, + "grad_norm": 0.461370050907135, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0301, + "step": 27180 + }, + { + "epoch": 0.7774124374553252, + "grad_norm": 0.3844483494758606, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0276, + "step": 27190 + }, + { + "epoch": 0.777698355968549, + "grad_norm": 0.32640641927719116, + "learning_rate": 2.411157015949963e-06, + "loss": 0.0262, + "step": 27200 + }, + { + "epoch": 0.7779842744817727, + "grad_norm": 0.6539550423622131, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0303, + "step": 27210 + }, + { + "epoch": 0.7782701929949964, + "grad_norm": 0.5505805015563965, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0267, + "step": 27220 + }, + { + "epoch": 0.7785561115082201, + "grad_norm": 0.433768630027771, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0299, + "step": 27230 + }, + { + "epoch": 0.7788420300214439, + "grad_norm": 0.7262346148490906, + "learning_rate": 2.399584779188417e-06, + "loss": 0.0278, + "step": 27240 + }, + { + "epoch": 0.7791279485346676, + "grad_norm": 0.6827511787414551, + "learning_rate": 2.396716944205467e-06, + "loss": 0.0319, + "step": 27250 + }, + { + "epoch": 0.7794138670478914, + "grad_norm": 0.3138200342655182, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.0261, + "step": 27260 + }, + { + "epoch": 0.7796997855611151, + "grad_norm": 0.36588770151138306, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0251, + "step": 27270 + }, + { + "epoch": 0.7799857040743388, + "grad_norm": 1.105770468711853, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0272, + "step": 27280 + }, + { + "epoch": 0.7802716225875626, + "grad_norm": 0.4482360780239105, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.0247, + "step": 27290 + }, + { + "epoch": 0.7805575411007862, + "grad_norm": 0.5545430779457092, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0307, + "step": 27300 + }, + { + "epoch": 0.78084345961401, + "grad_norm": 0.45449620485305786, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0264, + "step": 27310 + }, + { + "epoch": 0.7811293781272337, + "grad_norm": 0.37734025716781616, + "learning_rate": 2.376924986395271e-06, + "loss": 0.0275, + "step": 27320 + }, + { + "epoch": 0.7814152966404575, + "grad_norm": 0.47029784321784973, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0325, + "step": 27330 + }, + { + "epoch": 0.7817012151536812, + "grad_norm": 0.3540012240409851, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0258, + "step": 27340 + }, + { + "epoch": 0.781987133666905, + "grad_norm": 0.8363472819328308, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0288, + "step": 27350 + }, + { + "epoch": 0.7822730521801287, + "grad_norm": 0.5943127274513245, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.0289, + "step": 27360 + }, + { + "epoch": 0.7825589706933523, + "grad_norm": 0.48346707224845886, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0304, + "step": 27370 + }, + { + "epoch": 0.7828448892065761, + "grad_norm": 0.5776712894439697, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.0262, + "step": 27380 + }, + { + "epoch": 0.7831308077197998, + "grad_norm": 0.37524285912513733, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0342, + "step": 27390 + }, + { + "epoch": 0.7834167262330236, + "grad_norm": 0.4272121787071228, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.0262, + "step": 27400 + }, + { + "epoch": 0.7837026447462473, + "grad_norm": 0.3545357286930084, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.0273, + "step": 27410 + }, + { + "epoch": 0.7839885632594711, + "grad_norm": 0.4780922830104828, + "learning_rate": 2.349511203900333e-06, + "loss": 0.0255, + "step": 27420 + }, + { + "epoch": 0.7842744817726948, + "grad_norm": 0.6846514940261841, + "learning_rate": 2.3468256081258e-06, + "loss": 0.035, + "step": 27430 + }, + { + "epoch": 0.7845604002859186, + "grad_norm": 0.6890650391578674, + "learning_rate": 2.344150167333397e-06, + "loss": 0.0305, + "step": 27440 + }, + { + "epoch": 0.7848463187991422, + "grad_norm": 0.41689804196357727, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0275, + "step": 27450 + }, + { + "epoch": 0.7851322373123659, + "grad_norm": 0.5169947743415833, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0261, + "step": 27460 + }, + { + "epoch": 0.7854181558255897, + "grad_norm": 0.3667839467525482, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0267, + "step": 27470 + }, + { + "epoch": 0.7857040743388134, + "grad_norm": 0.4650583267211914, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0259, + "step": 27480 + }, + { + "epoch": 0.7859899928520372, + "grad_norm": 0.5303590297698975, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0318, + "step": 27490 + }, + { + "epoch": 0.7862759113652609, + "grad_norm": 0.38010939955711365, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.0292, + "step": 27500 + }, + { + "epoch": 0.7865618298784847, + "grad_norm": 0.5952475070953369, + "learning_rate": 2.325706683525094e-06, + "loss": 0.0265, + "step": 27510 + }, + { + "epoch": 0.7868477483917083, + "grad_norm": 0.34000876545906067, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0255, + "step": 27520 + }, + { + "epoch": 0.7871336669049321, + "grad_norm": 0.333310604095459, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0266, + "step": 27530 + }, + { + "epoch": 0.7874195854181558, + "grad_norm": 1.0167195796966553, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0303, + "step": 27540 + }, + { + "epoch": 0.7877055039313795, + "grad_norm": 0.506395697593689, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0255, + "step": 27550 + }, + { + "epoch": 0.7879914224446033, + "grad_norm": 0.4995521008968353, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.0232, + "step": 27560 + }, + { + "epoch": 0.788277340957827, + "grad_norm": 0.592944324016571, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0271, + "step": 27570 + }, + { + "epoch": 0.7885632594710508, + "grad_norm": 0.5690013766288757, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0349, + "step": 27580 + }, + { + "epoch": 0.7888491779842745, + "grad_norm": 0.5303569436073303, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0301, + "step": 27590 + }, + { + "epoch": 0.7891350964974982, + "grad_norm": 0.4314960539340973, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0266, + "step": 27600 + }, + { + "epoch": 0.7894210150107219, + "grad_norm": 0.4138862192630768, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0237, + "step": 27610 + }, + { + "epoch": 0.7897069335239457, + "grad_norm": 0.5151752829551697, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.0268, + "step": 27620 + }, + { + "epoch": 0.7899928520371694, + "grad_norm": 0.7513082027435303, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.031, + "step": 27630 + }, + { + "epoch": 0.7902787705503931, + "grad_norm": 0.2644256055355072, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0259, + "step": 27640 + }, + { + "epoch": 0.7905646890636169, + "grad_norm": 0.5767413377761841, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0312, + "step": 27650 + }, + { + "epoch": 0.7908506075768406, + "grad_norm": 0.4754960536956787, + "learning_rate": 2.287865908463585e-06, + "loss": 0.035, + "step": 27660 + }, + { + "epoch": 0.7911365260900644, + "grad_norm": 0.4080045521259308, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.0271, + "step": 27670 + }, + { + "epoch": 0.791422444603288, + "grad_norm": 0.3843805193901062, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0312, + "step": 27680 + }, + { + "epoch": 0.7917083631165118, + "grad_norm": 0.3925490975379944, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0242, + "step": 27690 + }, + { + "epoch": 0.7919942816297355, + "grad_norm": 0.3966064155101776, + "learning_rate": 2.278163146933236e-06, + "loss": 0.0257, + "step": 27700 + }, + { + "epoch": 0.7922802001429593, + "grad_norm": 0.6077889204025269, + "learning_rate": 2.275763038367336e-06, + "loss": 0.0238, + "step": 27710 + }, + { + "epoch": 0.792566118656183, + "grad_norm": 0.6053628921508789, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0356, + "step": 27720 + }, + { + "epoch": 0.7928520371694067, + "grad_norm": 0.49703511595726013, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0283, + "step": 27730 + }, + { + "epoch": 0.7931379556826305, + "grad_norm": 0.5619977712631226, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0272, + "step": 27740 + }, + { + "epoch": 0.7934238741958541, + "grad_norm": 0.6108564734458923, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0271, + "step": 27750 + }, + { + "epoch": 0.7937097927090779, + "grad_norm": 0.4029979109764099, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0265, + "step": 27760 + }, + { + "epoch": 0.7939957112223016, + "grad_norm": 0.45793306827545166, + "learning_rate": 2.261577490652103e-06, + "loss": 0.0229, + "step": 27770 + }, + { + "epoch": 0.7942816297355254, + "grad_norm": 0.433551162481308, + "learning_rate": 2.259249109209093e-06, + "loss": 0.0264, + "step": 27780 + }, + { + "epoch": 0.7945675482487491, + "grad_norm": 0.4247429072856903, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0269, + "step": 27790 + }, + { + "epoch": 0.7948534667619729, + "grad_norm": 0.4973151981830597, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.0281, + "step": 27800 + }, + { + "epoch": 0.7951393852751966, + "grad_norm": 0.5111087560653687, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.0267, + "step": 27810 + }, + { + "epoch": 0.7954253037884202, + "grad_norm": 0.5530220866203308, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0291, + "step": 27820 + }, + { + "epoch": 0.795711222301644, + "grad_norm": 0.4368492662906647, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0302, + "step": 27830 + }, + { + "epoch": 0.7959971408148677, + "grad_norm": 0.5381907820701599, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0253, + "step": 27840 + }, + { + "epoch": 0.7962830593280915, + "grad_norm": 0.3638664186000824, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.0258, + "step": 27850 + }, + { + "epoch": 0.7965689778413152, + "grad_norm": 0.38014277815818787, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.0279, + "step": 27860 + }, + { + "epoch": 0.796854896354539, + "grad_norm": 0.46882548928260803, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.0272, + "step": 27870 + }, + { + "epoch": 0.7971408148677627, + "grad_norm": 0.4826337397098541, + "learning_rate": 2.236529916369313e-06, + "loss": 0.027, + "step": 27880 + }, + { + "epoch": 0.7974267333809865, + "grad_norm": 0.7986114621162415, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0263, + "step": 27890 + }, + { + "epoch": 0.7977126518942101, + "grad_norm": 0.5447944402694702, + "learning_rate": 2.232109406453595e-06, + "loss": 0.0321, + "step": 27900 + }, + { + "epoch": 0.7979985704074338, + "grad_norm": 0.21586239337921143, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0241, + "step": 27910 + }, + { + "epoch": 0.7982844889206576, + "grad_norm": 0.8066816926002502, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0285, + "step": 27920 + }, + { + "epoch": 0.7985704074338813, + "grad_norm": 0.5516615509986877, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0302, + "step": 27930 + }, + { + "epoch": 0.7988563259471051, + "grad_norm": 0.6859652996063232, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0244, + "step": 27940 + }, + { + "epoch": 0.7991422444603288, + "grad_norm": 0.5234702229499817, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0272, + "step": 27950 + }, + { + "epoch": 0.7994281629735526, + "grad_norm": 0.32633450627326965, + "learning_rate": 2.219094909262834e-06, + "loss": 0.0249, + "step": 27960 + }, + { + "epoch": 0.7997140814867763, + "grad_norm": 0.5086314678192139, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0278, + "step": 27970 + }, + { + "epoch": 0.8, + "grad_norm": 0.40988171100616455, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.0297, + "step": 27980 + }, + { + "epoch": 0.8002859185132237, + "grad_norm": 0.4648076891899109, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0271, + "step": 27990 + }, + { + "epoch": 0.8005718370264474, + "grad_norm": 0.7577387690544128, + "learning_rate": 2.210624641425579e-06, + "loss": 0.0328, + "step": 28000 + }, + { + "epoch": 0.8008577555396712, + "grad_norm": 0.39426741003990173, + "learning_rate": 2.208532855337684e-06, + "loss": 0.0243, + "step": 28010 + }, + { + "epoch": 0.8011436740528949, + "grad_norm": 0.5410818457603455, + "learning_rate": 2.2064513865261646e-06, + "loss": 0.0289, + "step": 28020 + }, + { + "epoch": 0.8014295925661187, + "grad_norm": 0.3485671281814575, + "learning_rate": 2.204380237433745e-06, + "loss": 0.0283, + "step": 28030 + }, + { + "epoch": 0.8017155110793424, + "grad_norm": 0.6367644667625427, + "learning_rate": 2.202319410491029e-06, + "loss": 0.0272, + "step": 28040 + }, + { + "epoch": 0.8020014295925662, + "grad_norm": 0.4387468099594116, + "learning_rate": 2.2002689081165155e-06, + "loss": 0.0278, + "step": 28050 + }, + { + "epoch": 0.8022873481057898, + "grad_norm": 0.7296497821807861, + "learning_rate": 2.1982287327165827e-06, + "loss": 0.0259, + "step": 28060 + }, + { + "epoch": 0.8025732666190136, + "grad_norm": 0.40375930070877075, + "learning_rate": 2.19619888668549e-06, + "loss": 0.0259, + "step": 28070 + }, + { + "epoch": 0.8028591851322373, + "grad_norm": 0.6340100169181824, + "learning_rate": 2.1941793724053733e-06, + "loss": 0.0313, + "step": 28080 + }, + { + "epoch": 0.803145103645461, + "grad_norm": 0.3453208804130554, + "learning_rate": 2.1921701922462463e-06, + "loss": 0.028, + "step": 28090 + }, + { + "epoch": 0.8034310221586848, + "grad_norm": 0.5798079967498779, + "learning_rate": 2.190171348565994e-06, + "loss": 0.0257, + "step": 28100 + }, + { + "epoch": 0.8037169406719085, + "grad_norm": 0.3712709844112396, + "learning_rate": 2.188182843710369e-06, + "loss": 0.0255, + "step": 28110 + }, + { + "epoch": 0.8040028591851323, + "grad_norm": 0.3774068355560303, + "learning_rate": 2.1862046800129964e-06, + "loss": 0.0292, + "step": 28120 + }, + { + "epoch": 0.8042887776983559, + "grad_norm": 0.6050616502761841, + "learning_rate": 2.1842368597953578e-06, + "loss": 0.0292, + "step": 28130 + }, + { + "epoch": 0.8045746962115797, + "grad_norm": 0.3443267047405243, + "learning_rate": 2.1822793853668e-06, + "loss": 0.0459, + "step": 28140 + }, + { + "epoch": 0.8048606147248034, + "grad_norm": 0.5769096612930298, + "learning_rate": 2.18033225902453e-06, + "loss": 0.0277, + "step": 28150 + }, + { + "epoch": 0.8051465332380272, + "grad_norm": 0.5020616054534912, + "learning_rate": 2.17839548305361e-06, + "loss": 0.0273, + "step": 28160 + }, + { + "epoch": 0.8054324517512509, + "grad_norm": 0.3149321973323822, + "learning_rate": 2.1764690597269507e-06, + "loss": 0.0282, + "step": 28170 + }, + { + "epoch": 0.8057183702644746, + "grad_norm": 0.3835159242153168, + "learning_rate": 2.17455299130532e-06, + "loss": 0.0295, + "step": 28180 + }, + { + "epoch": 0.8060042887776984, + "grad_norm": 0.6308208703994751, + "learning_rate": 2.17264728003733e-06, + "loss": 0.0333, + "step": 28190 + }, + { + "epoch": 0.806290207290922, + "grad_norm": 0.4104989767074585, + "learning_rate": 2.17075192815944e-06, + "loss": 0.0246, + "step": 28200 + }, + { + "epoch": 0.8065761258041458, + "grad_norm": 0.5490663051605225, + "learning_rate": 2.168866937895951e-06, + "loss": 0.0243, + "step": 28210 + }, + { + "epoch": 0.8068620443173695, + "grad_norm": 0.44579270482063293, + "learning_rate": 2.166992311459001e-06, + "loss": 0.0297, + "step": 28220 + }, + { + "epoch": 0.8071479628305933, + "grad_norm": 0.38116511702537537, + "learning_rate": 2.1651280510485727e-06, + "loss": 0.0223, + "step": 28230 + }, + { + "epoch": 0.807433881343817, + "grad_norm": 0.5825269222259521, + "learning_rate": 2.163274158852476e-06, + "loss": 0.0289, + "step": 28240 + }, + { + "epoch": 0.8077197998570408, + "grad_norm": 0.396100789308548, + "learning_rate": 2.1614306370463605e-06, + "loss": 0.0287, + "step": 28250 + }, + { + "epoch": 0.8080057183702645, + "grad_norm": 0.3408491611480713, + "learning_rate": 2.1595974877936977e-06, + "loss": 0.0275, + "step": 28260 + }, + { + "epoch": 0.8082916368834882, + "grad_norm": 0.4204134941101074, + "learning_rate": 2.1577747132457933e-06, + "loss": 0.0289, + "step": 28270 + }, + { + "epoch": 0.8085775553967119, + "grad_norm": 1.1906534433364868, + "learning_rate": 2.155962315541773e-06, + "loss": 0.0302, + "step": 28280 + }, + { + "epoch": 0.8088634739099356, + "grad_norm": 0.4449160397052765, + "learning_rate": 2.154160296808588e-06, + "loss": 0.026, + "step": 28290 + }, + { + "epoch": 0.8091493924231594, + "grad_norm": 0.9066163301467896, + "learning_rate": 2.1523686591610064e-06, + "loss": 0.0289, + "step": 28300 + }, + { + "epoch": 0.8094353109363831, + "grad_norm": 0.30709517002105713, + "learning_rate": 2.1505874047016146e-06, + "loss": 0.0218, + "step": 28310 + }, + { + "epoch": 0.8097212294496069, + "grad_norm": 0.3318001329898834, + "learning_rate": 2.1488165355208147e-06, + "loss": 0.0281, + "step": 28320 + }, + { + "epoch": 0.8100071479628306, + "grad_norm": 0.34999215602874756, + "learning_rate": 2.14705605369682e-06, + "loss": 0.0285, + "step": 28330 + }, + { + "epoch": 0.8102930664760544, + "grad_norm": 0.41680973768234253, + "learning_rate": 2.145305961295655e-06, + "loss": 0.0305, + "step": 28340 + }, + { + "epoch": 0.810578984989278, + "grad_norm": 0.3743407428264618, + "learning_rate": 2.143566260371149e-06, + "loss": 0.0256, + "step": 28350 + }, + { + "epoch": 0.8108649035025017, + "grad_norm": 0.872268795967102, + "learning_rate": 2.141836952964938e-06, + "loss": 0.0259, + "step": 28360 + }, + { + "epoch": 0.8111508220157255, + "grad_norm": 0.36687538027763367, + "learning_rate": 2.1401180411064616e-06, + "loss": 0.0279, + "step": 28370 + }, + { + "epoch": 0.8114367405289492, + "grad_norm": 0.511329174041748, + "learning_rate": 2.138409526812959e-06, + "loss": 0.0275, + "step": 28380 + }, + { + "epoch": 0.811722659042173, + "grad_norm": 0.3234724998474121, + "learning_rate": 2.1367114120894663e-06, + "loss": 0.0257, + "step": 28390 + }, + { + "epoch": 0.8120085775553967, + "grad_norm": 0.5732539296150208, + "learning_rate": 2.1350236989288136e-06, + "loss": 0.0288, + "step": 28400 + }, + { + "epoch": 0.8122944960686205, + "grad_norm": 0.4985447824001312, + "learning_rate": 2.1333463893116294e-06, + "loss": 0.0248, + "step": 28410 + }, + { + "epoch": 0.8125804145818442, + "grad_norm": 0.49544450640678406, + "learning_rate": 2.131679485206329e-06, + "loss": 0.0295, + "step": 28420 + }, + { + "epoch": 0.812866333095068, + "grad_norm": 1.0728929042816162, + "learning_rate": 2.130022988569117e-06, + "loss": 0.0313, + "step": 28430 + }, + { + "epoch": 0.8131522516082916, + "grad_norm": 0.2358589768409729, + "learning_rate": 2.128376901343984e-06, + "loss": 0.0218, + "step": 28440 + }, + { + "epoch": 0.8134381701215153, + "grad_norm": 0.518035352230072, + "learning_rate": 2.1267412254627056e-06, + "loss": 0.0292, + "step": 28450 + }, + { + "epoch": 0.8137240886347391, + "grad_norm": 0.43305718898773193, + "learning_rate": 2.1251159628448386e-06, + "loss": 0.0301, + "step": 28460 + }, + { + "epoch": 0.8140100071479628, + "grad_norm": 0.7385976314544678, + "learning_rate": 2.1235011153977192e-06, + "loss": 0.0263, + "step": 28470 + }, + { + "epoch": 0.8142959256611866, + "grad_norm": 0.435623437166214, + "learning_rate": 2.121896685016461e-06, + "loss": 0.0283, + "step": 28480 + }, + { + "epoch": 0.8145818441744103, + "grad_norm": 0.5866786241531372, + "learning_rate": 2.1203026735839514e-06, + "loss": 0.0257, + "step": 28490 + }, + { + "epoch": 0.8148677626876341, + "grad_norm": 0.8038771152496338, + "learning_rate": 2.118719082970852e-06, + "loss": 0.0315, + "step": 28500 + }, + { + "epoch": 0.8151536812008577, + "grad_norm": 0.33963197469711304, + "learning_rate": 2.1171459150355947e-06, + "loss": 0.0306, + "step": 28510 + }, + { + "epoch": 0.8154395997140815, + "grad_norm": 0.5540177822113037, + "learning_rate": 2.115583171624381e-06, + "loss": 0.0263, + "step": 28520 + }, + { + "epoch": 0.8157255182273052, + "grad_norm": 0.5438565015792847, + "learning_rate": 2.114030854571176e-06, + "loss": 0.0307, + "step": 28530 + }, + { + "epoch": 0.8160114367405289, + "grad_norm": 0.36572158336639404, + "learning_rate": 2.1124889656977097e-06, + "loss": 0.0265, + "step": 28540 + }, + { + "epoch": 0.8162973552537527, + "grad_norm": 0.27488043904304504, + "learning_rate": 2.1109575068134756e-06, + "loss": 0.0243, + "step": 28550 + }, + { + "epoch": 0.8165832737669764, + "grad_norm": 0.5578693151473999, + "learning_rate": 2.1094364797157267e-06, + "loss": 0.0317, + "step": 28560 + }, + { + "epoch": 0.8168691922802002, + "grad_norm": 0.7271984815597534, + "learning_rate": 2.107925886189472e-06, + "loss": 0.0246, + "step": 28570 + }, + { + "epoch": 0.8171551107934238, + "grad_norm": 0.8810591697692871, + "learning_rate": 2.1064257280074763e-06, + "loss": 0.0285, + "step": 28580 + }, + { + "epoch": 0.8174410293066476, + "grad_norm": 0.43811503052711487, + "learning_rate": 2.1049360069302594e-06, + "loss": 0.0264, + "step": 28590 + }, + { + "epoch": 0.8177269478198713, + "grad_norm": 0.4820844531059265, + "learning_rate": 2.1034567247060926e-06, + "loss": 0.0292, + "step": 28600 + }, + { + "epoch": 0.8180128663330951, + "grad_norm": 0.4385477900505066, + "learning_rate": 2.1019878830709968e-06, + "loss": 0.026, + "step": 28610 + }, + { + "epoch": 0.8182987848463188, + "grad_norm": 0.6242631673812866, + "learning_rate": 2.100529483748737e-06, + "loss": 0.0275, + "step": 28620 + }, + { + "epoch": 0.8185847033595425, + "grad_norm": 0.5929499864578247, + "learning_rate": 2.099081528450828e-06, + "loss": 0.0228, + "step": 28630 + }, + { + "epoch": 0.8188706218727663, + "grad_norm": 0.9688727259635925, + "learning_rate": 2.097644018876524e-06, + "loss": 0.0233, + "step": 28640 + }, + { + "epoch": 0.81915654038599, + "grad_norm": 0.3581937849521637, + "learning_rate": 2.096216956712826e-06, + "loss": 0.0257, + "step": 28650 + }, + { + "epoch": 0.8194424588992137, + "grad_norm": 0.29479968547821045, + "learning_rate": 2.0948003436344666e-06, + "loss": 0.0275, + "step": 28660 + }, + { + "epoch": 0.8197283774124374, + "grad_norm": 0.5298082232475281, + "learning_rate": 2.0933941813039244e-06, + "loss": 0.0282, + "step": 28670 + }, + { + "epoch": 0.8200142959256612, + "grad_norm": 0.3596552610397339, + "learning_rate": 2.091998471371406e-06, + "loss": 0.0279, + "step": 28680 + }, + { + "epoch": 0.8203002144388849, + "grad_norm": 0.5539126396179199, + "learning_rate": 2.0906132154748557e-06, + "loss": 0.0233, + "step": 28690 + }, + { + "epoch": 0.8205861329521087, + "grad_norm": 0.7187175154685974, + "learning_rate": 2.0892384152399504e-06, + "loss": 0.0352, + "step": 28700 + }, + { + "epoch": 0.8208720514653324, + "grad_norm": 0.5331593155860901, + "learning_rate": 2.0878740722800917e-06, + "loss": 0.0258, + "step": 28710 + }, + { + "epoch": 0.8211579699785561, + "grad_norm": 0.7345555424690247, + "learning_rate": 2.086520188196413e-06, + "loss": 0.0332, + "step": 28720 + }, + { + "epoch": 0.8214438884917799, + "grad_norm": 0.3517567217350006, + "learning_rate": 2.085176764577774e-06, + "loss": 0.0297, + "step": 28730 + }, + { + "epoch": 0.8217298070050035, + "grad_norm": 0.5344868302345276, + "learning_rate": 2.083843803000755e-06, + "loss": 0.031, + "step": 28740 + }, + { + "epoch": 0.8220157255182273, + "grad_norm": 0.34053468704223633, + "learning_rate": 2.0825213050296636e-06, + "loss": 0.0262, + "step": 28750 + }, + { + "epoch": 0.822301644031451, + "grad_norm": 0.5699623227119446, + "learning_rate": 2.081209272216522e-06, + "loss": 0.0304, + "step": 28760 + }, + { + "epoch": 0.8225875625446748, + "grad_norm": 0.788611888885498, + "learning_rate": 2.079907706101075e-06, + "loss": 0.0301, + "step": 28770 + }, + { + "epoch": 0.8228734810578985, + "grad_norm": 0.43659698963165283, + "learning_rate": 2.0786166082107833e-06, + "loss": 0.0264, + "step": 28780 + }, + { + "epoch": 0.8231593995711223, + "grad_norm": 0.4736769199371338, + "learning_rate": 2.0773359800608217e-06, + "loss": 0.0255, + "step": 28790 + }, + { + "epoch": 0.823445318084346, + "grad_norm": 0.39587756991386414, + "learning_rate": 2.076065823154079e-06, + "loss": 0.0273, + "step": 28800 + }, + { + "epoch": 0.8237312365975696, + "grad_norm": 0.4977654218673706, + "learning_rate": 2.0748061389811543e-06, + "loss": 0.0284, + "step": 28810 + }, + { + "epoch": 0.8240171551107934, + "grad_norm": 0.6429978013038635, + "learning_rate": 2.073556929020357e-06, + "loss": 0.0284, + "step": 28820 + }, + { + "epoch": 0.8243030736240171, + "grad_norm": 0.41012001037597656, + "learning_rate": 2.0723181947377057e-06, + "loss": 0.0249, + "step": 28830 + }, + { + "epoch": 0.8245889921372409, + "grad_norm": 0.4937886595726013, + "learning_rate": 2.0710899375869237e-06, + "loss": 0.0321, + "step": 28840 + }, + { + "epoch": 0.8248749106504646, + "grad_norm": 0.3416379988193512, + "learning_rate": 2.0698721590094387e-06, + "loss": 0.0261, + "step": 28850 + }, + { + "epoch": 0.8251608291636884, + "grad_norm": 0.6058022379875183, + "learning_rate": 2.0686648604343824e-06, + "loss": 0.0261, + "step": 28860 + }, + { + "epoch": 0.8254467476769121, + "grad_norm": 0.5241106152534485, + "learning_rate": 2.067468043278587e-06, + "loss": 0.0268, + "step": 28870 + }, + { + "epoch": 0.8257326661901359, + "grad_norm": 0.4829743802547455, + "learning_rate": 2.066281708946583e-06, + "loss": 0.0302, + "step": 28880 + }, + { + "epoch": 0.8260185847033595, + "grad_norm": 0.5692874789237976, + "learning_rate": 2.0651058588306007e-06, + "loss": 0.0283, + "step": 28890 + }, + { + "epoch": 0.8263045032165832, + "grad_norm": 0.3703140914440155, + "learning_rate": 2.063940494310565e-06, + "loss": 0.0236, + "step": 28900 + }, + { + "epoch": 0.826590421729807, + "grad_norm": 0.44660329818725586, + "learning_rate": 2.062785616754097e-06, + "loss": 0.024, + "step": 28910 + }, + { + "epoch": 0.8268763402430307, + "grad_norm": 0.5036526918411255, + "learning_rate": 2.0616412275165097e-06, + "loss": 0.0259, + "step": 28920 + }, + { + "epoch": 0.8271622587562545, + "grad_norm": 0.40677565336227417, + "learning_rate": 2.0605073279408063e-06, + "loss": 0.0287, + "step": 28930 + }, + { + "epoch": 0.8274481772694782, + "grad_norm": 0.37139707803726196, + "learning_rate": 2.0593839193576833e-06, + "loss": 0.0274, + "step": 28940 + }, + { + "epoch": 0.827734095782702, + "grad_norm": 0.4395834803581238, + "learning_rate": 2.058271003085521e-06, + "loss": 0.0243, + "step": 28950 + }, + { + "epoch": 0.8280200142959256, + "grad_norm": 0.41849637031555176, + "learning_rate": 2.0571685804303905e-06, + "loss": 0.0364, + "step": 28960 + }, + { + "epoch": 0.8283059328091494, + "grad_norm": 0.532619833946228, + "learning_rate": 2.0560766526860447e-06, + "loss": 0.0257, + "step": 28970 + }, + { + "epoch": 0.8285918513223731, + "grad_norm": 0.6253917813301086, + "learning_rate": 2.054995221133923e-06, + "loss": 0.0276, + "step": 28980 + }, + { + "epoch": 0.8288777698355968, + "grad_norm": 0.5655578374862671, + "learning_rate": 2.053924287043144e-06, + "loss": 0.0255, + "step": 28990 + }, + { + "epoch": 0.8291636883488206, + "grad_norm": 0.46769171953201294, + "learning_rate": 2.0528638516705106e-06, + "loss": 0.0303, + "step": 29000 + }, + { + "epoch": 0.8294496068620443, + "grad_norm": 0.5021181702613831, + "learning_rate": 2.051813916260501e-06, + "loss": 0.0287, + "step": 29010 + }, + { + "epoch": 0.8297355253752681, + "grad_norm": 0.6354414820671082, + "learning_rate": 2.050774482045273e-06, + "loss": 0.0243, + "step": 29020 + }, + { + "epoch": 0.8300214438884918, + "grad_norm": 0.3923643231391907, + "learning_rate": 2.049745550244661e-06, + "loss": 0.027, + "step": 29030 + }, + { + "epoch": 0.8303073624017155, + "grad_norm": 0.7594497203826904, + "learning_rate": 2.0487271220661735e-06, + "loss": 0.0303, + "step": 29040 + }, + { + "epoch": 0.8305932809149392, + "grad_norm": 1.0209988355636597, + "learning_rate": 2.047719198704994e-06, + "loss": 0.0319, + "step": 29050 + }, + { + "epoch": 0.830879199428163, + "grad_norm": 0.4946758449077606, + "learning_rate": 2.0467217813439762e-06, + "loss": 0.0238, + "step": 29060 + }, + { + "epoch": 0.8311651179413867, + "grad_norm": 0.3641301095485687, + "learning_rate": 2.0457348711536426e-06, + "loss": 0.0299, + "step": 29070 + }, + { + "epoch": 0.8314510364546104, + "grad_norm": 0.4918605387210846, + "learning_rate": 2.0447584692921894e-06, + "loss": 0.0275, + "step": 29080 + }, + { + "epoch": 0.8317369549678342, + "grad_norm": 0.5097633004188538, + "learning_rate": 2.043792576905478e-06, + "loss": 0.0301, + "step": 29090 + }, + { + "epoch": 0.8320228734810579, + "grad_norm": 0.7338542938232422, + "learning_rate": 2.0428371951270394e-06, + "loss": 0.0261, + "step": 29100 + }, + { + "epoch": 0.8323087919942816, + "grad_norm": 0.5453478097915649, + "learning_rate": 2.0418923250780633e-06, + "loss": 0.0279, + "step": 29110 + }, + { + "epoch": 0.8325947105075053, + "grad_norm": 0.38167792558670044, + "learning_rate": 2.0409579678674084e-06, + "loss": 0.0245, + "step": 29120 + }, + { + "epoch": 0.8328806290207291, + "grad_norm": 0.75771164894104, + "learning_rate": 2.040034124591597e-06, + "loss": 0.0305, + "step": 29130 + }, + { + "epoch": 0.8331665475339528, + "grad_norm": 0.820286214351654, + "learning_rate": 2.039120796334809e-06, + "loss": 0.0277, + "step": 29140 + }, + { + "epoch": 0.8334524660471766, + "grad_norm": 0.26554685831069946, + "learning_rate": 2.0382179841688868e-06, + "loss": 0.0309, + "step": 29150 + }, + { + "epoch": 0.8337383845604003, + "grad_norm": 0.5767927765846252, + "learning_rate": 2.0373256891533293e-06, + "loss": 0.0264, + "step": 29160 + }, + { + "epoch": 0.834024303073624, + "grad_norm": 0.45167428255081177, + "learning_rate": 2.0364439123352956e-06, + "loss": 0.0313, + "step": 29170 + }, + { + "epoch": 0.8343102215868478, + "grad_norm": 0.49742552638053894, + "learning_rate": 2.0355726547495998e-06, + "loss": 0.0263, + "step": 29180 + }, + { + "epoch": 0.8345961401000714, + "grad_norm": 0.3473125398159027, + "learning_rate": 2.034711917418711e-06, + "loss": 0.0329, + "step": 29190 + }, + { + "epoch": 0.8348820586132952, + "grad_norm": 0.6035035848617554, + "learning_rate": 2.033861701352752e-06, + "loss": 0.027, + "step": 29200 + }, + { + "epoch": 0.8351679771265189, + "grad_norm": 0.4132099449634552, + "learning_rate": 2.0330220075494992e-06, + "loss": 0.0284, + "step": 29210 + }, + { + "epoch": 0.8354538956397427, + "grad_norm": 0.3904581367969513, + "learning_rate": 2.0321928369943807e-06, + "loss": 0.0271, + "step": 29220 + }, + { + "epoch": 0.8357398141529664, + "grad_norm": 0.40810489654541016, + "learning_rate": 2.031374190660474e-06, + "loss": 0.026, + "step": 29230 + }, + { + "epoch": 0.8360257326661902, + "grad_norm": 0.5281389355659485, + "learning_rate": 2.0305660695085054e-06, + "loss": 0.0264, + "step": 29240 + }, + { + "epoch": 0.8363116511794139, + "grad_norm": 0.3951243758201599, + "learning_rate": 2.0297684744868494e-06, + "loss": 0.0272, + "step": 29250 + }, + { + "epoch": 0.8365975696926375, + "grad_norm": 0.3633115291595459, + "learning_rate": 2.0289814065315306e-06, + "loss": 0.0248, + "step": 29260 + }, + { + "epoch": 0.8368834882058613, + "grad_norm": 0.7965249419212341, + "learning_rate": 2.0282048665662153e-06, + "loss": 0.0443, + "step": 29270 + }, + { + "epoch": 0.837169406719085, + "grad_norm": 0.6424257159233093, + "learning_rate": 2.0274388555022176e-06, + "loss": 0.0257, + "step": 29280 + }, + { + "epoch": 0.8374553252323088, + "grad_norm": 0.4431753158569336, + "learning_rate": 2.0266833742384928e-06, + "loss": 0.0309, + "step": 29290 + }, + { + "epoch": 0.8377412437455325, + "grad_norm": 0.6503756046295166, + "learning_rate": 2.0259384236616404e-06, + "loss": 0.0275, + "step": 29300 + }, + { + "epoch": 0.8380271622587563, + "grad_norm": 0.5955492258071899, + "learning_rate": 2.0252040046459022e-06, + "loss": 0.0282, + "step": 29310 + }, + { + "epoch": 0.83831308077198, + "grad_norm": 0.3691128194332123, + "learning_rate": 2.02448011805316e-06, + "loss": 0.0321, + "step": 29320 + }, + { + "epoch": 0.8385989992852038, + "grad_norm": 0.42951759696006775, + "learning_rate": 2.023766764732934e-06, + "loss": 0.0254, + "step": 29330 + }, + { + "epoch": 0.8388849177984274, + "grad_norm": 0.5496651530265808, + "learning_rate": 2.0230639455223853e-06, + "loss": 0.0273, + "step": 29340 + }, + { + "epoch": 0.8391708363116511, + "grad_norm": 0.44067010283470154, + "learning_rate": 2.0223716612463095e-06, + "loss": 0.0253, + "step": 29350 + }, + { + "epoch": 0.8394567548248749, + "grad_norm": 0.5913621783256531, + "learning_rate": 2.0216899127171424e-06, + "loss": 0.0244, + "step": 29360 + }, + { + "epoch": 0.8397426733380986, + "grad_norm": 0.5189345479011536, + "learning_rate": 2.0210187007349534e-06, + "loss": 0.0278, + "step": 29370 + }, + { + "epoch": 0.8400285918513224, + "grad_norm": 0.5561279058456421, + "learning_rate": 2.0203580260874474e-06, + "loss": 0.0293, + "step": 29380 + }, + { + "epoch": 0.8403145103645461, + "grad_norm": 0.39133086800575256, + "learning_rate": 2.019707889549963e-06, + "loss": 0.0249, + "step": 29390 + }, + { + "epoch": 0.8406004288777699, + "grad_norm": 0.47845765948295593, + "learning_rate": 2.01906829188547e-06, + "loss": 0.0265, + "step": 29400 + }, + { + "epoch": 0.8408863473909935, + "grad_norm": 0.37436914443969727, + "learning_rate": 2.018439233844574e-06, + "loss": 0.0242, + "step": 29410 + }, + { + "epoch": 0.8411722659042173, + "grad_norm": 0.3853163719177246, + "learning_rate": 2.0178207161655087e-06, + "loss": 0.0235, + "step": 29420 + }, + { + "epoch": 0.841458184417441, + "grad_norm": 0.422373503446579, + "learning_rate": 2.0172127395741398e-06, + "loss": 0.0289, + "step": 29430 + }, + { + "epoch": 0.8417441029306647, + "grad_norm": 0.4003738462924957, + "learning_rate": 2.0166153047839603e-06, + "loss": 0.031, + "step": 29440 + }, + { + "epoch": 0.8420300214438885, + "grad_norm": 0.4747001528739929, + "learning_rate": 2.016028412496094e-06, + "loss": 0.0245, + "step": 29450 + }, + { + "epoch": 0.8423159399571122, + "grad_norm": 0.35080668330192566, + "learning_rate": 2.015452063399292e-06, + "loss": 0.0312, + "step": 29460 + }, + { + "epoch": 0.842601858470336, + "grad_norm": 0.406414270401001, + "learning_rate": 2.014886258169932e-06, + "loss": 0.0322, + "step": 29470 + }, + { + "epoch": 0.8428877769835597, + "grad_norm": 0.43083736300468445, + "learning_rate": 2.014330997472017e-06, + "loss": 0.0259, + "step": 29480 + }, + { + "epoch": 0.8431736954967834, + "grad_norm": 0.317290723323822, + "learning_rate": 2.013786281957177e-06, + "loss": 0.0258, + "step": 29490 + }, + { + "epoch": 0.8434596140100071, + "grad_norm": 0.38952547311782837, + "learning_rate": 2.0132521122646662e-06, + "loss": 0.0249, + "step": 29500 + }, + { + "epoch": 0.8437455325232309, + "grad_norm": 0.331080824136734, + "learning_rate": 2.0127284890213623e-06, + "loss": 0.0251, + "step": 29510 + }, + { + "epoch": 0.8440314510364546, + "grad_norm": 0.4097452163696289, + "learning_rate": 2.012215412841767e-06, + "loss": 0.0252, + "step": 29520 + }, + { + "epoch": 0.8443173695496783, + "grad_norm": 0.5636009573936462, + "learning_rate": 2.011712884328003e-06, + "loss": 0.0261, + "step": 29530 + }, + { + "epoch": 0.8446032880629021, + "grad_norm": 0.516639769077301, + "learning_rate": 2.011220904069815e-06, + "loss": 0.0318, + "step": 29540 + }, + { + "epoch": 0.8448892065761258, + "grad_norm": 0.3075452446937561, + "learning_rate": 2.01073947264457e-06, + "loss": 0.0229, + "step": 29550 + }, + { + "epoch": 0.8451751250893496, + "grad_norm": 0.38183867931365967, + "learning_rate": 2.0102685906172543e-06, + "loss": 0.0266, + "step": 29560 + }, + { + "epoch": 0.8454610436025732, + "grad_norm": 0.3114834129810333, + "learning_rate": 2.009808258540475e-06, + "loss": 0.0261, + "step": 29570 + }, + { + "epoch": 0.845746962115797, + "grad_norm": 1.277812123298645, + "learning_rate": 2.009358476954456e-06, + "loss": 0.0341, + "step": 29580 + }, + { + "epoch": 0.8460328806290207, + "grad_norm": 0.44270217418670654, + "learning_rate": 2.008919246387043e-06, + "loss": 0.0279, + "step": 29590 + }, + { + "epoch": 0.8463187991422445, + "grad_norm": 0.9557573199272156, + "learning_rate": 2.0084905673536952e-06, + "loss": 0.0283, + "step": 29600 + }, + { + "epoch": 0.8466047176554682, + "grad_norm": 0.6227599382400513, + "learning_rate": 2.0080724403574922e-06, + "loss": 0.0267, + "step": 29610 + }, + { + "epoch": 0.8468906361686919, + "grad_norm": 0.5279103517532349, + "learning_rate": 2.007664865889131e-06, + "loss": 0.0267, + "step": 29620 + }, + { + "epoch": 0.8471765546819157, + "grad_norm": 0.9109275937080383, + "learning_rate": 2.0072678444269208e-06, + "loss": 0.0336, + "step": 29630 + }, + { + "epoch": 0.8474624731951393, + "grad_norm": 0.26767420768737793, + "learning_rate": 2.006881376436789e-06, + "loss": 0.0249, + "step": 29640 + }, + { + "epoch": 0.8477483917083631, + "grad_norm": 0.3451564610004425, + "learning_rate": 2.0065054623722772e-06, + "loss": 0.0239, + "step": 29650 + }, + { + "epoch": 0.8480343102215868, + "grad_norm": 0.47498252987861633, + "learning_rate": 2.0061401026745425e-06, + "loss": 0.0309, + "step": 29660 + }, + { + "epoch": 0.8483202287348106, + "grad_norm": 0.49723026156425476, + "learning_rate": 2.005785297772354e-06, + "loss": 0.0277, + "step": 29670 + }, + { + "epoch": 0.8486061472480343, + "grad_norm": 0.365113228559494, + "learning_rate": 2.005441048082095e-06, + "loss": 0.0271, + "step": 29680 + }, + { + "epoch": 0.8488920657612581, + "grad_norm": 0.29341810941696167, + "learning_rate": 2.0051073540077617e-06, + "loss": 0.0272, + "step": 29690 + }, + { + "epoch": 0.8491779842744818, + "grad_norm": 0.4523782432079315, + "learning_rate": 2.0047842159409633e-06, + "loss": 0.0314, + "step": 29700 + }, + { + "epoch": 0.8494639027877054, + "grad_norm": 0.3393439054489136, + "learning_rate": 2.004471634260919e-06, + "loss": 0.0279, + "step": 29710 + }, + { + "epoch": 0.8497498213009292, + "grad_norm": 0.4606449007987976, + "learning_rate": 2.004169609334462e-06, + "loss": 0.0331, + "step": 29720 + }, + { + "epoch": 0.8500357398141529, + "grad_norm": 0.35131368041038513, + "learning_rate": 2.003878141516035e-06, + "loss": 0.0299, + "step": 29730 + }, + { + "epoch": 0.8503216583273767, + "grad_norm": 0.34658634662628174, + "learning_rate": 2.0035972311476916e-06, + "loss": 0.0288, + "step": 29740 + }, + { + "epoch": 0.8506075768406004, + "grad_norm": 0.3298470675945282, + "learning_rate": 2.0033268785590954e-06, + "loss": 0.0249, + "step": 29750 + }, + { + "epoch": 0.8508934953538242, + "grad_norm": 0.44585004448890686, + "learning_rate": 2.003067084067522e-06, + "loss": 0.0284, + "step": 29760 + }, + { + "epoch": 0.8511794138670479, + "grad_norm": 0.8015826940536499, + "learning_rate": 2.0028178479778523e-06, + "loss": 0.027, + "step": 29770 + }, + { + "epoch": 0.8514653323802717, + "grad_norm": 0.5542123913764954, + "learning_rate": 2.0025791705825805e-06, + "loss": 0.0261, + "step": 29780 + }, + { + "epoch": 0.8517512508934953, + "grad_norm": 0.5209313035011292, + "learning_rate": 2.0023510521618066e-06, + "loss": 0.0276, + "step": 29790 + }, + { + "epoch": 0.852037169406719, + "grad_norm": 0.3215568959712982, + "learning_rate": 2.0021334929832407e-06, + "loss": 0.0295, + "step": 29800 + }, + { + "epoch": 0.8523230879199428, + "grad_norm": 0.48849478363990784, + "learning_rate": 2.0019264933022016e-06, + "loss": 0.0247, + "step": 29810 + }, + { + "epoch": 0.8526090064331665, + "grad_norm": 0.32018014788627625, + "learning_rate": 2.001730053361614e-06, + "loss": 0.0241, + "step": 29820 + }, + { + "epoch": 0.8528949249463903, + "grad_norm": 0.5317611694335938, + "learning_rate": 2.0015441733920105e-06, + "loss": 0.0267, + "step": 29830 + }, + { + "epoch": 0.853180843459614, + "grad_norm": 0.691618025302887, + "learning_rate": 2.0013688536115332e-06, + "loss": 0.0304, + "step": 29840 + }, + { + "epoch": 0.8534667619728378, + "grad_norm": 0.4876650273799896, + "learning_rate": 2.0012040942259285e-06, + "loss": 0.0264, + "step": 29850 + }, + { + "epoch": 0.8537526804860615, + "grad_norm": 0.464668333530426, + "learning_rate": 2.0010498954285506e-06, + "loss": 0.0247, + "step": 29860 + }, + { + "epoch": 0.8540385989992852, + "grad_norm": 0.694965124130249, + "learning_rate": 2.00090625740036e-06, + "loss": 0.0254, + "step": 29870 + }, + { + "epoch": 0.8543245175125089, + "grad_norm": 0.48797327280044556, + "learning_rate": 2.0007731803099256e-06, + "loss": 0.0325, + "step": 29880 + }, + { + "epoch": 0.8546104360257326, + "grad_norm": 0.3835223913192749, + "learning_rate": 2.00065066431342e-06, + "loss": 0.0292, + "step": 29890 + }, + { + "epoch": 0.8548963545389564, + "grad_norm": 0.5236513614654541, + "learning_rate": 2.0005387095546222e-06, + "loss": 0.029, + "step": 29900 + }, + { + "epoch": 0.8551822730521801, + "grad_norm": 1.3250213861465454, + "learning_rate": 2.000437316164917e-06, + "loss": 0.0282, + "step": 29910 + }, + { + "epoch": 0.8554681915654039, + "grad_norm": 0.24530036747455597, + "learning_rate": 2.000346484263297e-06, + "loss": 0.0306, + "step": 29920 + }, + { + "epoch": 0.8557541100786276, + "grad_norm": 0.3535238802433014, + "learning_rate": 2.0002662139563564e-06, + "loss": 0.0252, + "step": 29930 + }, + { + "epoch": 0.8560400285918514, + "grad_norm": 0.5203806161880493, + "learning_rate": 2.0001965053382976e-06, + "loss": 0.0294, + "step": 29940 + }, + { + "epoch": 0.856325947105075, + "grad_norm": 0.6543874144554138, + "learning_rate": 2.000137358490928e-06, + "loss": 0.0284, + "step": 29950 + }, + { + "epoch": 0.8566118656182988, + "grad_norm": 0.40720510482788086, + "learning_rate": 2.0000887734836583e-06, + "loss": 0.0305, + "step": 29960 + }, + { + "epoch": 0.8568977841315225, + "grad_norm": 0.5478134155273438, + "learning_rate": 2.0000507503735076e-06, + "loss": 0.0281, + "step": 29970 + }, + { + "epoch": 0.8571837026447462, + "grad_norm": 0.35732871294021606, + "learning_rate": 2.0000232892050976e-06, + "loss": 0.0273, + "step": 29980 + }, + { + "epoch": 0.85746962115797, + "grad_norm": 0.39969056844711304, + "learning_rate": 2.000006390010655e-06, + "loss": 0.0229, + "step": 29990 + }, + { + "epoch": 0.8577555396711937, + "grad_norm": 0.3445800840854645, + "learning_rate": 2.0000000528100118e-06, + "loss": 0.0269, + "step": 30000 + }, + { + "epoch": 0.8577555396711937, + "step": 30000, + "total_flos": 1.87391671271424e+17, + "train_loss": 0.04361908026635647, + "train_runtime": 19998.4591, + "train_samples_per_second": 12.001, + "train_steps_per_second": 1.5 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.87391671271424e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/training_args.bin b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..9a8e9db2fc8c02e02c3d9dc8ab6720ad303a5b3a --- /dev/null +++ b/libero_object_two_extra_pi0_VIS_PROJ_HEAD/libero_object_two_extra_pi0_20260201-161941_lr2e-05_batchsize8/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:612ba70c7690571cb25b3741b149289d0da6675f330268700d4dd75e92ecc19a +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a860a7d89303338797caa8f409e685aa2987e95 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02013c98e7ce3d8f711c653e8de0a1bf58e014218bf2b3794f6d8e1abccc7090 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b2aece3ccbce99ef19f8aaa4f145f463d399c74b --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67f042fad6885ce59c1a7b66c114f6ca95aefb8b272572ccfbb9fefa60faa196 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1058ae09b65f8b577a1dae303d9a418d054f18c9 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e824e737a72fd083d7d169ea7b3a6a52b7993776f71c1fd6a89b6426fcbb2bc +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9801b876d4902a6f04c8f4fc65c072e6082867 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -4.131592681121827, + -18.96289906921387, + -16.909606227111816, + -1.205507601451874, + -2.2364452423095704, + -1.8819086204528812, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 16.65274486618042, + 37.19429024200439, + 23.655689654541014, + 1.3209557065963748, + 2.6528479496955875, + 1.1486967510223387, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 2.868856906890869, + 6.296340465545654, + 1.3196077346801758, + 0.007151931058615446, + -0.012491658329963684, + -0.12626242637634277, + 0.12140887975692749, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 4.3321146965026855, + 12.4215087890625, + 7.703039169311523, + 0.391439288854599, + 0.8076039552688599, + 0.505150318145752, + 0.9926025867462158, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.763728466033935, + -21.229162658691408, + -2.350775989151001, + -4.0587354017257695, + -3.285622364997864, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.5495108631134, + 30.41332916412354, + 14.36571702880859, + 1.8286980584144592, + 2.2455153399467473, + 1.9114159921646117, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.469674587249756, + 1.137302041053772, + -3.50521183013916, + -0.009232619777321815, + -0.7088616490364075, + -0.43785586953163147, + 0.14176446199417114, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.948984146118164, + 16.641460418701172, + 8.162801742553711, + 0.6890953779220581, + 1.1180040836334229, + 0.9564125537872314, + 0.9899004101753235, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6d9c5389113265fa0352268f4a20d63826a6f9a4 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/trainer_state.json @@ -0,0 +1,8434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7190364911019235, + "eval_steps": 500, + "global_step": 12000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 5.55898904800415, + "learning_rate": 1.8e-07, + "loss": 0.7669, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.9875104427337646, + "learning_rate": 3.8e-07, + "loss": 0.7281, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 6.316451072692871, + "learning_rate": 5.800000000000001e-07, + "loss": 0.7134, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 4.037688255310059, + "learning_rate": 7.8e-07, + "loss": 0.6077, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 5.4920220375061035, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6779, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 3.809985876083374, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5578, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 5.501481533050537, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5453, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 2.584683418273926, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4145, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 2.854585886001587, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3617, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 3.2181553840637207, + "learning_rate": 1.98e-06, + "loss": 0.3402, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 1.6713179349899292, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2286, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 2.60302996635437, + "learning_rate": 2.38e-06, + "loss": 0.2477, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 1.7488818168640137, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1342, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 1.826812982559204, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.1243, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 1.1744091510772705, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1012, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 2.3573529720306396, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1108, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 2.1422371864318848, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1081, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.6756604313850403, + "learning_rate": 3.58e-06, + "loss": 0.0947, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 1.8197052478790283, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.103, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 2.135390281677246, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0791, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 1.185013771057129, + "learning_rate": 4.18e-06, + "loss": 0.0751, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 1.478454828262329, + "learning_rate": 4.38e-06, + "loss": 0.0685, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 1.1979939937591553, + "learning_rate": 4.58e-06, + "loss": 0.0642, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 1.3315266370773315, + "learning_rate": 4.78e-06, + "loss": 0.0706, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 1.219875454902649, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 1.9281997680664062, + "learning_rate": 5.18e-06, + "loss": 0.0781, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.5599610209465027, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0742, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.9128719568252563, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0638, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.5633432269096375, + "learning_rate": 5.78e-06, + "loss": 0.0633, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.7961149215698242, + "learning_rate": 5.98e-06, + "loss": 0.062, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 1.9408375024795532, + "learning_rate": 6.18e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 1.1925369501113892, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0654, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 1.0636825561523438, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0513, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.5671424865722656, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0561, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.8431388139724731, + "learning_rate": 6.98e-06, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 1.3813819885253906, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0619, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.7528055906295776, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0502, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 1.38446044921875, + "learning_rate": 7.58e-06, + "loss": 0.0623, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.9472984671592712, + "learning_rate": 7.78e-06, + "loss": 0.0471, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.640555739402771, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0539, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 1.4841065406799316, + "learning_rate": 8.18e-06, + "loss": 0.0684, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 1.0691452026367188, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.8026740550994873, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0579, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 1.3472259044647217, + "learning_rate": 8.78e-06, + "loss": 0.0725, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.8364902138710022, + "learning_rate": 8.98e-06, + "loss": 0.0613, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 1.0086181163787842, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0558, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 1.0559569597244263, + "learning_rate": 9.38e-06, + "loss": 0.0561, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.9138600826263428, + "learning_rate": 9.58e-06, + "loss": 0.0507, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.6099390387535095, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0543, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.890690803527832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.071, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.8349231481552124, + "learning_rate": 1.018e-05, + "loss": 0.0515, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 1.5466762781143188, + "learning_rate": 1.038e-05, + "loss": 0.0865, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 1.0859519243240356, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0511, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.7235454320907593, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.6314525008201599, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0494, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 1.5067164897918701, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0453, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.9329689145088196, + "learning_rate": 1.138e-05, + "loss": 0.0565, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 1.3631505966186523, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0513, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 1.2341063022613525, + "learning_rate": 1.178e-05, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.7126315832138062, + "learning_rate": 1.198e-05, + "loss": 0.0465, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.9995419383049011, + "learning_rate": 1.218e-05, + "loss": 0.0423, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.7614652514457703, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0466, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.7718682289123535, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0508, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.7280911803245544, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0481, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.6350377798080444, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0493, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.6868598461151123, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.057, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 1.132020354270935, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0464, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 1.097875952720642, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0465, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.8246905207633972, + "learning_rate": 1.378e-05, + "loss": 0.0488, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.5858931541442871, + "learning_rate": 1.398e-05, + "loss": 0.0533, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.418e-05, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.87618488073349, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0417, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.8312808871269226, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0627, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.5213949680328369, + "learning_rate": 1.478e-05, + "loss": 0.0526, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.7599508762359619, + "learning_rate": 1.498e-05, + "loss": 0.0487, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.9282987713813782, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0544, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 1.5959566831588745, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0594, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.6384497284889221, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.049, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.5377854108810425, + "learning_rate": 1.578e-05, + "loss": 0.0529, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.6186609864234924, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0485, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.9750168323516846, + "learning_rate": 1.618e-05, + "loss": 0.0458, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.6810588836669922, + "learning_rate": 1.638e-05, + "loss": 0.0521, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.8613447546958923, + "learning_rate": 1.658e-05, + "loss": 0.0464, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.8379164338111877, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0589, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.9312345385551453, + "learning_rate": 1.698e-05, + "loss": 0.0534, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.6983106732368469, + "learning_rate": 1.718e-05, + "loss": 0.0591, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.6549938321113586, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0571, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3887499272823334, + "learning_rate": 1.758e-05, + "loss": 0.0362, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 1.1392686367034912, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0602, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.834979772567749, + "learning_rate": 1.798e-05, + "loss": 0.0483, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.9094700813293457, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0536, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.9519254565238953, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0514, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.6514044404029846, + "learning_rate": 1.858e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.6005147099494934, + "learning_rate": 1.878e-05, + "loss": 0.0527, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 1.0990339517593384, + "learning_rate": 1.898e-05, + "loss": 0.0453, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.7029110193252563, + "learning_rate": 1.918e-05, + "loss": 0.0527, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.6106461882591248, + "learning_rate": 1.938e-05, + "loss": 0.043, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.48976996541023254, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0482, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 1.045139193534851, + "learning_rate": 1.978e-05, + "loss": 0.0449, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.7444337010383606, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0499, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.8378720879554749, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0606, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.5345956683158875, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.041, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.6428268551826477, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0648, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.9010246992111206, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0441, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.6655222177505493, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0532, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.5328973531723022, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0488, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 1.2394806146621704, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0525, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.9671902656555176, + "learning_rate": 1.999967041472886e-05, + "loss": 0.051, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.8754792213439941, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.054, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.524354875087738, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0682, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 1.0633796453475952, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0435, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.7348024249076843, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.923546552658081, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0501, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 1.0579051971435547, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.8214036822319031, + "learning_rate": 1.999882759038658e-05, + "loss": 0.057, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.7640904188156128, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0468, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5744732022285461, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0416, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.40397152304649353, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0389, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.6207796931266785, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0484, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 1.5230320692062378, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0586, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.8499330282211304, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0671, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.7697583436965942, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.061, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.6107252836227417, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.40468829870224, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0558, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.7711566686630249, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0487, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 1.0216137170791626, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0411, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 1.1135109663009644, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0428, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.545289158821106, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0426, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.9514102339744568, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0529, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.9448748826980591, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0468, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 1.1176340579986572, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.06, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.6428054571151733, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0398, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.8000763058662415, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0688, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.7624617218971252, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0524, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.7986068725585938, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0511, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 1.179044246673584, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0518, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.7511209845542908, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.041, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.8336644768714905, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.7198546528816223, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0472, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 1.404756784439087, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0479, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.861412525177002, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0448, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 1.2575286626815796, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0504, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.7020149230957031, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0416, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.9072129726409912, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0483, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.5503928661346436, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0498, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.5776561498641968, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.7854406237602234, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0431, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.7011817097663879, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0615, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.7760916352272034, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0525, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.9866206049919128, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0492, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.7466640472412109, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0564, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.8808642029762268, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0461, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.8980852365493774, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0613, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.6824257969856262, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0763, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.681532084941864, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0492, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.5667393207550049, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0471, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.5026432275772095, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0424, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.37448638677597046, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.037, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.6236661076545715, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.9748323559761047, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0326, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.7733910083770752, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0527, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.6466084718704224, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.6644402146339417, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0434, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 1.5936143398284912, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0495, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.5655786991119385, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0475, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.9557194709777832, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0518, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.8929481506347656, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0435, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.7515624761581421, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0404, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.7718303203582764, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0476, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.5583183765411377, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0495, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.7166038155555725, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0601, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.9311782717704773, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0507, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6159361600875854, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0319, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.816769003868103, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0505, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.9040331244468689, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.0498, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 1.696012020111084, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0689, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.5169436931610107, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0414, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 1.9156256914138794, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0558, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.6522107720375061, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0427, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.8480607867240906, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0425, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.6939795017242432, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0521, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.5763843059539795, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0486, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.6420201063156128, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0428, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.5305889248847961, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0371, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 1.3216971158981323, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0441, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.6441370844841003, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0444, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 1.4227683544158936, + "learning_rate": 1.996014938229576e-05, + "loss": 0.053, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.667000412940979, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0405, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.6865925192832947, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0532, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.8819414377212524, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0402, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.8738685250282288, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0494, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.8790421485900879, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0753, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.5451251268386841, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0385, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.46721863746643066, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0395, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.41896265745162964, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0461, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.7582527995109558, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0461, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.7154091596603394, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0464, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.788686215877533, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.46885132789611816, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0472, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.5174703598022461, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0501, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.8058022260665894, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.044, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.49327152967453003, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0404, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 1.532515048980713, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0548, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 1.1101130247116089, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0542, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.7396823763847351, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.042, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5801792740821838, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0589, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 1.4451886415481567, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0402, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.61793053150177, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0583, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.8073042631149292, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0492, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.9468027949333191, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0466, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.7384629249572754, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0589, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.4612124562263489, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.043, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.6821345090866089, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0373, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.6727206110954285, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0706, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.6935863494873047, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0376, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.9824007153511047, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0418, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.9782054424285889, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0453, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.7749345898628235, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0449, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 1.1558616161346436, + "learning_rate": 1.992544454099507e-05, + "loss": 0.051, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.33876606822013855, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0463, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.5539175271987915, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0389, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.554639995098114, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0375, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.46284249424934387, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0365, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.7209586501121521, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0465, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 1.0352572202682495, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0609, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.3893347680568695, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0449, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.3959295451641083, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.042, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.47758615016937256, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0608, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.7173318266868591, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0511, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.5889247059822083, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.5986958146095276, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.9506963491439819, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0513, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.8730902671813965, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0429, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.5152983069419861, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0347, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.786233127117157, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0464, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.7376151084899902, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0479, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.595055878162384, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0392, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.8207923769950867, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0441, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.7003177404403687, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.036, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.6637803316116333, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.5207458138465881, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0476, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 1.241939663887024, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0466, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.7212964296340942, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0459, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.6244897246360779, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.571205198764801, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0611, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.8839776515960693, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0464, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.580142080783844, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0434, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.6745111346244812, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0443, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.9726730585098267, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.48007458448410034, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0442, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.7205815315246582, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0461, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.5800597667694092, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0553, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.6497617959976196, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0398, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.7487000226974487, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.053, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.6686383485794067, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0494, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.6101617217063904, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0397, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.49039891362190247, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 1.076252818107605, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0472, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.7085466980934143, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0481, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.6343501210212708, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.7452435493469238, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0485, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.6645557880401611, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0455, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.5987662076950073, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0384, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 1.078682541847229, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0416, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.8880276083946228, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0427, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.8119439482688904, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0516, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.5018808245658875, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.035, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.623843252658844, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0468, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.48201584815979004, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0387, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.5672967433929443, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0374, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.7304291129112244, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0458, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 1.1493513584136963, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0495, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.8220258951187134, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0565, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 1.0740118026733398, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0484, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.6214267015457153, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0346, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6255515813827515, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 1.0625102519989014, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0511, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.8623147010803223, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.043, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.92961186170578, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0428, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.6050530076026917, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0405, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.944632351398468, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0434, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.4904105067253113, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0423, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.7352654337882996, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0425, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 1.0492011308670044, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0616, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.7823440432548523, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0447, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.8018720149993896, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0371, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.49853745102882385, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.036, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.8805229663848877, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0524, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.5573164820671082, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0387, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.7481330633163452, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0466, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.40816730260849, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0651, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.6791403889656067, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0393, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.7291558384895325, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0521, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.6312416791915894, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0489, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.7327824831008911, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0343, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 1.3112396001815796, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 1.2425460815429688, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0419, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.6839079856872559, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0491, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.7781338691711426, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0434, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.5329035520553589, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0468, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.7196246981620789, + "learning_rate": 1.978769450291435e-05, + "loss": 0.044, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.7625473737716675, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0441, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.5458085536956787, + "learning_rate": 1.978346349055984e-05, + "loss": 0.039, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.7765107154846191, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0467, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.7010345458984375, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.04, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.626748263835907, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0373, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.5149411559104919, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0461, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.9740221500396729, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.037, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.504397988319397, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.054, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.5483772158622742, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0365, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.29313552379608154, + "learning_rate": 1.976612732743278e-05, + "loss": 0.038, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.8453809022903442, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0413, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.5152369141578674, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0383, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.9969985485076904, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0465, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.9506912231445312, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0377, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.9154256582260132, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0428, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 1.2283018827438354, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0403, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.6880149841308594, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0395, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.4900283217430115, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0368, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.7604786157608032, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0447, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.559420108795166, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0456, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5867525339126587, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.4810929596424103, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0406, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.8294567465782166, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0405, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.8964418172836304, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0551, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5311513543128967, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.048, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.806564450263977, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0422, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.6752825975418091, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0436, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.5873673558235168, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.046, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.44951826333999634, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.6930672526359558, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0482, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5176821351051331, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0469, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.49050986766815186, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0505, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.7312544584274292, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0397, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.7582018375396729, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0472, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.5867499113082886, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0402, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.5435264706611633, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0357, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.7370457053184509, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.774713933467865, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0419, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 1.3614526987075806, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0443, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.6087996959686279, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0362, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.6685174703598022, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0437, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.9508783221244812, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0403, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.5553990006446838, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0454, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.5054144263267517, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0651, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.42293739318847656, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0431, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.7212286591529846, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0415, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.473127543926239, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.6872493028640747, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.031, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.5251455903053284, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0391, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.5380337834358215, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0409, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.7052116394042969, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0416, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.8229309916496277, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0372, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.9506240487098694, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0419, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.6417449116706848, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.6112877130508423, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0498, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 1.0621747970581055, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0478, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.7538444995880127, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0402, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.5625021457672119, + "learning_rate": 1.964833301001045e-05, + "loss": 0.048, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.47914358973503113, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0371, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.6854084134101868, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0478, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.9252145886421204, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0368, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.8439743518829346, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0417, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 1.0050065517425537, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0444, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.7451267242431641, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0444, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.8371824622154236, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0413, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 1.0461528301239014, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0343, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.39973369240760803, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0411, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.4291725754737854, + "learning_rate": 1.962083815106258e-05, + "loss": 0.035, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.7072318196296692, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.5897591710090637, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0422, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.724743664264679, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0412, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.6499989628791809, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0456, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.7375554442405701, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0481, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.5231707096099854, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0444, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.6235650777816772, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0352, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.43499720096588135, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0389, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.797736406326294, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0444, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 1.0550916194915771, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0504, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.6214169263839722, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0406, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.698083221912384, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0593, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.6379665732383728, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0493, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.5507146120071411, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0433, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.5956857204437256, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.44772031903266907, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0479, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.9360495209693909, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0434, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.5642439126968384, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0396, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.4046037495136261, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0408, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.5948778986930847, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0349, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.8199960589408875, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.035, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.4827987253665924, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0422, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.8324541449546814, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0396, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.4008340537548065, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0399, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.6216022372245789, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0456, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.37505266070365906, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0385, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.49176743626594543, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0394, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.5399725437164307, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0438, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.8310949802398682, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 1.1955338716506958, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0459, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 1.0068060159683228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0491, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.5460902452468872, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.7850955128669739, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.038, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.36727651953697205, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.042, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.5334084630012512, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0472, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.7271261215209961, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0382, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.5323888063430786, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0436, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.45585381984710693, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0374, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.7871994376182556, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0523, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.5605924129486084, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0394, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.6938880085945129, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0394, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.5804795026779175, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0437, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 1.0168874263763428, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0419, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.6860261559486389, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0381, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.7029629349708557, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.5081820487976074, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0359, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4721413254737854, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0445, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.36132606863975525, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0443, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.6331628561019897, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.5754039287567139, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0364, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 1.5680726766586304, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0568, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.49352893233299255, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0352, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.6292720437049866, + "learning_rate": 1.945830755977688e-05, + "loss": 0.056, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.7185224294662476, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.5580431222915649, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0395, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.7590157985687256, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0367, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.6500505208969116, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0373, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.408975213766098, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0458, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.5616204142570496, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.6361889243125916, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0371, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.8486977219581604, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0428, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.7492835521697998, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0444, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.7901867032051086, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0413, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.6845218539237976, + "learning_rate": 1.942106227801521e-05, + "loss": 0.041, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.9644033908843994, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0482, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.45466694235801697, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.37155815958976746, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0563, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.4936427175998688, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0466, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.6540364027023315, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0426, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.38369905948638916, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.5450782179832458, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0499, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.24151510000228882, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0431, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.8069043159484863, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0447, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.5423257946968079, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0496, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.4058588445186615, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0402, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.6126188635826111, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0458, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.7490487694740295, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0493, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.7295238971710205, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0462, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.7178632616996765, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0443, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.7040836215019226, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0414, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.6338651776313782, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0354, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 1.3360642194747925, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0503, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.46927154064178467, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.7340303659439087, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0381, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5492366552352905, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0328, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.7509336471557617, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0368, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.4471103847026825, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0405, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.6582043170928955, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0422, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.6933317184448242, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0347, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.450021892786026, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.5376274585723877, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0619, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.722744882106781, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0446, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.6075776219367981, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.047, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.6950559020042419, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0366, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.5763269066810608, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0416, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.5462995767593384, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.042, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.6304270029067993, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0388, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.6828057765960693, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0324, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.37152284383773804, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0451, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.4172256588935852, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0357, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.5640333294868469, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0522, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.8016167879104614, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0381, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.591262698173523, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0382, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.5212893486022949, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0478, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.7837402820587158, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0443, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.9257993698120117, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0468, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.5952717065811157, + "learning_rate": 1.926404507646751e-05, + "loss": 0.033, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.9675727486610413, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0451, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5518060326576233, + "learning_rate": 1.925630281527157e-05, + "loss": 0.039, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.9742224216461182, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0398, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.6197847723960876, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0466, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.47963154315948486, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0449, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.41337478160858154, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0441, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.7238340973854065, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0438, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.9248948097229004, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.059, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.6670559048652649, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0388, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.956350564956665, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0402, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.6378766894340515, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0377, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.9037134647369385, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.046, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.7720431685447693, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0519, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.7988153100013733, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0437, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.6672379970550537, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.8264118432998657, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0463, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.6753244400024414, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.048, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.5530163645744324, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0552, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 1.4215611219406128, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0537, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.8495141267776489, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0431, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.5609806180000305, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0355, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.30011680722236633, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0503, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.5155858993530273, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0402, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.48371294140815735, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0476, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.49065709114074707, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.4877799451351166, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0337, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.5917441248893738, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0379, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.42583322525024414, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.045, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.6343463659286499, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0449, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.8575727343559265, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0453, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.7644649147987366, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0396, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.6534778475761414, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0354, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.3632607161998749, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.035, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.9180629849433899, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.48914220929145813, + "learning_rate": 1.912298771234382e-05, + "loss": 0.043, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.8579902052879333, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0467, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 1.523177146911621, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 1.2650493383407593, + "learning_rate": 1.911035077753307e-05, + "loss": 0.046, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.8262631893157959, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0345, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.8710194826126099, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.8287770748138428, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.7243760824203491, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0445, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5953600406646729, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0409, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.5678296685218811, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0405, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.764759361743927, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0399, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.5969082713127136, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0345, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.5686851739883423, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0415, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.7042335867881775, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0343, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.46049684286117554, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0367, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.521037757396698, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0493, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.6116137504577637, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0341, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.6932541728019714, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.038, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.6795322299003601, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0555, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 1.5589205026626587, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0498, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.58689945936203, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0432, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.7746279239654541, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0455, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4707143008708954, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0365, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.6717873811721802, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0441, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 1.1001774072647095, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0387, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.6617273092269897, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.045, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 1.0732862949371338, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0461, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.43623387813568115, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0387, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.5842541456222534, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0401, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.8832051753997803, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0434, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.8454849123954773, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0364, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4587421119213104, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0342, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.5914700627326965, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0381, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.5075448751449585, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0614, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.6165316700935364, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0355, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.8761339783668518, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0382, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.8730667233467102, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0486, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.4631735384464264, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0479, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.7657212615013123, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0359, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.49685898423194885, + "learning_rate": 1.894749443411004e-05, + "loss": 0.037, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.8567603230476379, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0415, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.8778802156448364, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0427, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.7849876284599304, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.041, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.49304109811782837, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0406, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.6490961909294128, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0457, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 1.1704363822937012, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0489, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.6955687403678894, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0426, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.9385222792625427, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0397, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 1.0259507894515991, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0406, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 1.5581048727035522, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0377, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 1.1154224872589111, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0352, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.8913238048553467, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0372, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.32929253578186035, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0302, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.7686375379562378, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.7077587246894836, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0404, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.7370178699493408, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0379, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.8013477325439453, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0391, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.9743591547012329, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0466, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.6816489100456238, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0509, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.7814317345619202, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0449, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.6797910332679749, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.041, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.7159250378608704, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0408, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.7630175352096558, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0403, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.7929314374923706, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0468, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.5765302181243896, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0382, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.5043740272521973, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0447, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.7895818948745728, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0381, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.8037170767784119, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0434, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 1.0758732557296753, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0369, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.6673927307128906, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0475, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.6661775708198547, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0478, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.6422731280326843, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.6632615923881531, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0377, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.5715954899787903, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0306, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3375200629234314, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0385, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.42938506603240967, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0359, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.453436940908432, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0498, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.763883113861084, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.037, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.9350517392158508, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0524, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.6795313358306885, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0336, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4761887788772583, + "learning_rate": 1.875213208215953e-05, + "loss": 0.04, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.6547576189041138, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0359, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.7119831442832947, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0382, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.5195598602294922, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0577, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.44893282651901245, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.034, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.5159012079238892, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0374, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.6474353075027466, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0275, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.5070436000823975, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0382, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.28868627548217773, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0442, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.3915226459503174, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.6271824836730957, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0395, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 1.2117619514465332, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0409, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.4455721378326416, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0399, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0445, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.32646581530570984, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0435, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.4477322995662689, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0383, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.6562448740005493, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0317, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.25427868962287903, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0326, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.6234788298606873, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0328, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.4264411926269531, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0379, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.5537038445472717, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0383, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.5042442679405212, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0339, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.4152010679244995, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0324, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.6834092736244202, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0364, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.6276392340660095, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0336, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.687937319278717, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0415, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.48481765389442444, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0376, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 1.1335153579711914, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0421, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.6853719353675842, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.043, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.97500079870224, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0334, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.2953243553638458, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0334, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.6563237309455872, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0349, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.4983973205089569, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0441, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.42969775199890137, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0319, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.8316324353218079, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0359, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.4386466443538666, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0371, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.5664681792259216, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0359, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.5660601854324341, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.6432987451553345, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0447, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.6026568412780762, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0382, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.5358585119247437, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0366, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.3575671315193176, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0394, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.6645073890686035, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0391, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.6527594923973083, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0334, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.5664045810699463, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0426, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.3317505419254303, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0366, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.7218614220619202, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0399, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.6683867573738098, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0385, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.6589217185974121, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0445, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.39663317799568176, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0515, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.9468401074409485, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 1.0980640649795532, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0431, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 1.4567275047302246, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0467, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.3785778284072876, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0437, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.8112056255340576, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0406, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.8885411024093628, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0452, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.3356691002845764, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.033, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.7636258602142334, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.039, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.5050523281097412, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0331, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3761812150478363, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0346, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.560323178768158, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0417, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.5850566625595093, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0366, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.4377721846103668, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0315, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.5460193157196045, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0465, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.3818223476409912, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0313, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.566722571849823, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.037, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.970040500164032, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0354, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4968736171722412, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0376, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.5235893130302429, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0383, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.853208065032959, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0384, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.4627811312675476, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0615, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.4883791208267212, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0307, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.4702740013599396, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0539, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.5020611882209778, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0378, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.706611692905426, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0309, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.6137747764587402, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0364, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.45299193263053894, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0359, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.31410297751426697, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0425, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.48510870337486267, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.04, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.4697261154651642, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0401, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.8231471180915833, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0346, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.9511741995811462, + "learning_rate": 1.832162565208597e-05, + "loss": 0.038, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.4473752975463867, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0421, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.5309840440750122, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0375, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 1.1700010299682617, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0424, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.5007262229919434, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0389, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.8835527300834656, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.6059357523918152, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.37744027376174927, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0391, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.5641717910766602, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0383, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.4394749104976654, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0394, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.7094572186470032, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0384, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.6306723952293396, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0347, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.4480315148830414, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0415, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 1.014607310295105, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0426, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.7599517107009888, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0433, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 1.0942739248275757, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0378, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.47618037462234497, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0312, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6470023393630981, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0382, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.6031871438026428, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0336, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.7470970749855042, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0318, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.46166181564331055, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0361, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5585920214653015, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0443, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.5172198414802551, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0396, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.4908123314380646, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0294, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.5269665122032166, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0343, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.747257649898529, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0395, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.6794129610061646, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0471, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.4291394054889679, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0388, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.8051080107688904, + "learning_rate": 1.815952390818299e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.557299792766571, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0384, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.37832972407341003, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0333, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.30844688415527344, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.033, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.3014371395111084, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0344, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.778361439704895, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0351, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 1.14492666721344, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0462, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.35099321603775024, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0371, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.8470032215118408, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0339, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.641718327999115, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0363, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.6668172478675842, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0383, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.9396918416023254, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0401, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.5773718953132629, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0356, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.6474881172180176, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0487, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.5183063745498657, + "learning_rate": 1.807599344877606e-05, + "loss": 0.037, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.7699562311172485, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0487, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.6379490494728088, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0407, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4757876396179199, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0307, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.47382786870002747, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0367, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.6868136525154114, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0311, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.5475189089775085, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0293, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 1.013775110244751, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0383, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.46351560950279236, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0404, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.4883617162704468, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0408, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.6282979249954224, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0428, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 1.0833567380905151, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0394, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0405, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.7581565380096436, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0534, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.7900646328926086, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0432, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.6033529043197632, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0438, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.924926221370697, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0347, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.8485580682754517, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0523, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.3205278217792511, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0334, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.5392606854438782, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.03, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.6815987229347229, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0385, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.9605218768119812, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0359, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.5565723776817322, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0391, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.7528144717216492, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0431, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.5746167898178101, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0346, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.5058369636535645, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0346, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 1.1387027502059937, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0372, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.819324254989624, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0374, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.45600345730781555, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0344, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.7428935766220093, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0373, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.6960753202438354, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0387, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.6637990474700928, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0404, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.5612137317657471, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0375, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.6323001384735107, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0379, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.35169267654418945, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0371, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.38252803683280945, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0457, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.38694459199905396, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0345, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.37036198377609253, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0292, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.8060199618339539, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0398, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.44252580404281616, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0373, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.5565180778503418, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0345, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.4460795521736145, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0404, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.7309815883636475, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0364, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.6990997195243835, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0561, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.4198327660560608, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0401, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.04, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.48884230852127075, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0334, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.6440362930297852, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0451, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.9092825055122375, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0398, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.4839508533477783, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.8128801584243774, + "learning_rate": 1.776452218695584e-05, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.5291397571563721, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0394, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.6852243542671204, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0418, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.6294205188751221, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0374, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.5221384763717651, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0321, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.398296982049942, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0349, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.43008267879486084, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.6012991070747375, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0411, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.45076051354408264, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.037, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.6742259860038757, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0357, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.5989789962768555, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.037, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.4041040241718292, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0325, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.4937855899333954, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0354, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.5446217656135559, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0374, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.7479701638221741, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0415, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.7822495102882385, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0341, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.3672648072242737, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.035, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.5219965577125549, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0443, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.4092100262641907, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0331, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.5316944122314453, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0406, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 1.072263240814209, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0521, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.7448581457138062, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0362, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.44557711482048035, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0326, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.4298631250858307, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0365, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.45413365960121155, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0351, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.9562819004058838, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0394, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.9481335878372192, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0381, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.5020818114280701, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0402, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.6412234902381897, + "learning_rate": 1.758137056131309e-05, + "loss": 0.037, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.5511493682861328, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0535, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.5222594141960144, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0401, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.447127103805542, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0383, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.4780801832675934, + "learning_rate": 1.754802282200567e-05, + "loss": 0.041, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.2962804138660431, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0422, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.5125643014907837, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0337, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.4288216829299927, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0374, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4114690124988556, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0296, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.3511301577091217, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0315, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.8624657392501831, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0369, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.5518651008605957, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0364, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.5404661297798157, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0294, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.7494591474533081, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0315, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9748606085777283, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0429, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.8071768879890442, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0321, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.5210712552070618, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0355, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.6077958941459656, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0426, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.8688217997550964, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0366, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.7064969539642334, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0465, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0365, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.6350638270378113, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0419, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.42818939685821533, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0412, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.6915261745452881, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0327, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.9861057996749878, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.034, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.6910699009895325, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0463, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.6368144750595093, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0399, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 1.1909242868423462, + "learning_rate": 1.739216409306913e-05, + "loss": 0.042, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.6449970006942749, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0388, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.531061053276062, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0389, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.8275352716445923, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0503, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.8468548655509949, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0336, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.2949988842010498, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0342, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.30603477358818054, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 0.7177753448486328, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0381, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.4893733859062195, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0319, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.6618909239768982, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0317, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.5965152382850647, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0293, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.4357168674468994, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0478, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.9539002776145935, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0444, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.7171940207481384, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.037, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.5711817741394043, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.034, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.4134632647037506, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.39306095242500305, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0351, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.318985253572464, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0425, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.7674827575683594, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.041, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.7754977941513062, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0436, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.5827674269676208, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0371, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.3957774341106415, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0401, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.47415387630462646, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0344, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.6292631030082703, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.5913583636283875, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0385, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.465749055147171, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0402, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.7115443348884583, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0372, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.7476089596748352, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.042, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.5902891159057617, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0319, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.7117035984992981, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0312, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.7726907730102539, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0381, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.7318345308303833, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0464, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.8139578104019165, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0334, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.6128831505775452, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0338, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.478384405374527, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0361, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.36900776624679565, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0473, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 1.031351923942566, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0417, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.5248333215713501, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0402, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.6325647830963135, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.047, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.8417870402336121, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0406, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.617125391960144, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0385, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.4480224847793579, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0391, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 1.0203324556350708, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0379, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.6231842637062073, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0318, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37685611844062805, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0304, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 1.0700500011444092, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0362, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.4233555495738983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0341, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.7783017158508301, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0331, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.718287467956543, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0385, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.5477543473243713, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0308, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.5601311326026917, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0384, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.4944303631782532, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.5038384199142456, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0382, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.7288672924041748, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0319, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 1.0376721620559692, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0374, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.8827543258666992, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0351, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4307865798473358, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0321, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.5480561256408691, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0532, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.9598987102508545, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0365, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.4162677228450775, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0274, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.8729338049888611, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0437, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.7729384899139404, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0386, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.6997544169425964, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0303, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.49331608414649963, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0333, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.6684675812721252, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0329, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.5638986825942993, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.035, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.8375849723815918, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0431, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.5796175599098206, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0298, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.5302409529685974, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.032, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.43450990319252014, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0415, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.3897189795970917, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0372, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.8202592134475708, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0329, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.8023095726966858, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.3732883930206299, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0326, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.4916521906852722, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.031, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.46110638976097107, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.037, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.8587718605995178, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0351, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.7067242860794067, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.036, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.732545793056488, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.036, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.6573438048362732, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0392, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.6036579608917236, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0383, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.5556638836860657, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0396, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.7848073244094849, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0333, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.5758033394813538, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0315, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.5620765686035156, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0277, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.38210418820381165, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0437, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.6145310997962952, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0368, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.7370103001594543, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0349, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.942118763923645, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0399, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.5294848680496216, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0364, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.5716073513031006, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0313, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.4549729526042938, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0423, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.5841232538223267, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0369, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.3302208483219147, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.032, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.7107377648353577, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0382, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.6884296536445618, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0324, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.6279621720314026, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0314, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.882046103477478, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0408, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.8980706334114075, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0436, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.6433938145637512, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0395, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.6394492983818054, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.041, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.8700910806655884, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0333, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.6309515237808228, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0341, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.7955977916717529, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.8543604016304016, + "learning_rate": 1.663934987558109e-05, + "loss": 0.042, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.6915370225906372, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0347, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.6430726647377014, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0395, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.3080710768699646, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0299, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0407, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.7147136330604553, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0524, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.603560209274292, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.032, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.4913748502731323, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0419, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.532796323299408, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0463, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.7834717631340027, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0318, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.4865007698535919, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0329, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.5567988753318787, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0331, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.7487075328826904, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0408, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0294, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.7240496277809143, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0334, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.44733667373657227, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0378, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.7610008716583252, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0398, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 1.0738579034805298, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0461, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.5492804050445557, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0367, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.7817861437797546, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0392, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.6080313324928284, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0288, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.8218061923980713, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0335, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.6597305536270142, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0398, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.6254639625549316, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0339, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 1.0747283697128296, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0386, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.4679741859436035, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0409, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.7349653244018555, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0355, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.47712597250938416, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0524, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.8520345091819763, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0361, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.6470016837120056, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0296, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.8512763381004333, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0329, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.5876182913780212, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0381, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.47419166564941406, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0348, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.391215056180954, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0366, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.5373614430427551, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0373, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.23266319930553436, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0283, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.8146935105323792, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0377, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.5002696514129639, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0296, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.7518969774246216, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0394, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.44596755504608154, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0359, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.37095823884010315, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.031, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.48388785123825073, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0323, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.4681354761123657, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0573, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.9335370063781738, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0397, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.8231816291809082, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0307, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.7194622755050659, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0435, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.468923419713974, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.5806415677070618, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0422, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.6381694078445435, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0325, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.6025328636169434, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0321, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.7287771701812744, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0432, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.7109095454216003, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0315, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.4904409348964691, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0317, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.7382795214653015, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0296, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 1.2814927101135254, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.4594469368457794, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0297, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.5907943844795227, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.623093843460083, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0314, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.5146417021751404, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0362, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.5858095288276672, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0339, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.4178197383880615, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0445, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.37311851978302, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0321, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.6305625438690186, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0376, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.5927552580833435, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0339, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.4024806022644043, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.5766516327857971, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0325, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.4729812443256378, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0476, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.4650471806526184, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0387, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.6432391405105591, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.6335821151733398, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0307, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.5947774052619934, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0374, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.7248526811599731, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0286, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.5646173357963562, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0426, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.4240330457687378, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0261, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.6439619064331055, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0325, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.5899927020072937, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0328, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.6412765383720398, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.027, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.28143197298049927, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0285, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.2767931818962097, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0312, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.47175201773643494, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0318, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.4454171359539032, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0357, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.4573518931865692, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0319, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.5321150422096252, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0423, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.27531248331069946, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0284, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.663298487663269, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0328, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.9017484188079834, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0328, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0445, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.4777899980545044, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0348, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.5475958585739136, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0418, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.524467408657074, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0301, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.6302708387374878, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0334, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.41625329852104187, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0353, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2699313759803772, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0387, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.701999306678772, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.6053565144538879, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0343, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.864326000213623, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0371, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.7532107830047607, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0323, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.5603524446487427, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0357, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.5668624639511108, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0421, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.6352995038032532, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0381, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.7873902320861816, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0293, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.5853860378265381, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0336, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.525260329246521, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0404, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.4027518033981323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0334, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.9426722526550293, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0397, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.6003656983375549, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0408, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.643667459487915, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0507, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.6342907547950745, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0338, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.4388107657432556, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0393, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.3304736614227295, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0371, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.6479781866073608, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0357, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.5461524128913879, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0367, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.4362160563468933, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0302, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.5188114643096924, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0322, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.34805068373680115, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0355, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.5073755383491516, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0446, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.5647034645080566, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0386, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.5983169078826904, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.4163302481174469, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0278, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.5769792199134827, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.33103784918785095, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0272, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.6019038558006287, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0286, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.8199634552001953, + "learning_rate": 1.56658563993822e-05, + "loss": 0.041, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.7426667213439941, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0327, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.3630203306674957, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0316, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.7804543972015381, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0369, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.43314239382743835, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0362, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.5570499897003174, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0307, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.5796618461608887, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0312, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.7355082035064697, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0357, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.39807555079460144, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0281, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.7723329663276672, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0314, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.3936077058315277, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0344, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.6881195902824402, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0343, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.5343065857887268, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0336, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.6643530130386353, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.032, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.5642407536506653, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0326, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.6929567456245422, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0351, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.33013442158699036, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0362, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 1.056101679801941, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0443, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.5164589881896973, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0446, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.319035142660141, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0367, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.8530817627906799, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0321, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.7768056392669678, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0318, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.4015219211578369, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0263, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.6409371495246887, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0371, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.5829829573631287, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0424, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.8098331093788147, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0318, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.40581029653549194, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0345, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.5018268823623657, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0338, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.3689005970954895, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0304, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.4961407482624054, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0349, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.5551972389221191, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0389, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.5989762544631958, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0308, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.33431145548820496, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0291, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.5390793085098267, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0409, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.6348057389259338, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0299, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.9015149474143982, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0372, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.4148661494255066, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0351, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.48212167620658875, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0369, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.6210904121398926, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0387, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.4606397747993469, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0325, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.597671627998352, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0264, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.39612457156181335, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0291, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.514916718006134, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0327, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.3551333248615265, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0306, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.3721555173397064, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0343, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3669307231903076, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0339, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.5142899751663208, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0388, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.7722563147544861, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0319, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.5405625104904175, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.025, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.6617732048034668, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0361, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.8938334584236145, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0326, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.7913880944252014, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0325, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.6919751763343811, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0353, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.6518043279647827, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0292, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.8302627801895142, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0292, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.6278629302978516, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0314, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.42736759781837463, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0313, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 1.0469647645950317, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.038, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.4306422173976898, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.692587673664093, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.034, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.8272542953491211, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0332, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.700703501701355, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0435, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22474133968353271, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0348, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.47771376371383667, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0365, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.5043072700500488, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0336, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.4886966347694397, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0291, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.3845444321632385, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0418, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.6324570775032043, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0357, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.5614244937896729, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0351, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.4815816879272461, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0401, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.7729785442352295, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0357, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.589121401309967, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0319, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.5420895218849182, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0346, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.4504237771034241, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0279, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.26984909176826477, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.6075000762939453, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0319, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.6065084338188171, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0383, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.573225736618042, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0424, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.8821173906326294, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0409, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.4947790205478668, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0472, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.748337984085083, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0384, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.6375566124916077, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0373, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.6218035221099854, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0343, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.4296681880950928, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0317, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3609360158443451, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0348, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.49597665667533875, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.034, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.4339931309223175, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0351, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.44051092863082886, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0391, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.41610655188560486, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0345, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.6215106844902039, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0439, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.6418285965919495, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0289, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.6148926019668579, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0396, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.8690620064735413, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0371, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.4794996678829193, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.7622746229171753, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0396, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 1.0384955406188965, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0352, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.33424243330955505, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0272, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.5626234412193298, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0267, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.31714314222335815, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0297, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.8281066417694092, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0337, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.6054716110229492, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0336, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.5764144659042358, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0296, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.4696876108646393, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0318, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.5324695110321045, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0294, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.2989593744277954, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0275, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.6373855471611023, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0334, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.5332064032554626, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0333, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.4900652766227722, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.6812027096748352, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0321, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.6765509843826294, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0329, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.5016193389892578, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.034, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.5259473919868469, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0341, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.4551076292991638, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0289, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.5946309566497803, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0367, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.8045580387115479, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0292, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 1.089473843574524, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0433, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.7314861416816711, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0344, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.3244793713092804, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0329, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.9454575181007385, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.041, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.4321480393409729, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0338, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.7338399887084961, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0317, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.5811594724655151, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0299, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 1.1259782314300537, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0402, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.4460951089859009, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0279, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.4996945858001709, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0331, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.6428117156028748, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0339, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.7815113663673401, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0333, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.46364331245422363, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0321, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.6084109544754028, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0347, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.5775942206382751, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.4764224886894226, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0326, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.49608105421066284, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.033, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.40599140524864197, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0323, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.44920462369918823, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0348, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.393081396818161, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0329, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.5393109917640686, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0332, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.49641427397727966, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0341, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.4762181341648102, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0293, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.7498350143432617, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0338, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.5212231874465942, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0336, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3803718388080597, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0336, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.3723069429397583, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0313, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.6411343216896057, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0298, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.7487270832061768, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0334, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.4146348237991333, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0362, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.6354920864105225, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0345, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.8422425985336304, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.6452838182449341, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0317, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.6057304739952087, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0349, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.4880058467388153, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0283, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.6094764471054077, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0424, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.552979588508606, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0318, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.5134180188179016, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0267, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.3264164626598358, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0347, + "step": 12000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.49566685085696e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5fecc60b61aa66699566b01045633ce2fd4a6a74 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-12000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad96fcc5212b0fb64af2ed9b5a1ad33dee0cea6a86c08271b39c38f4388a38a +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..44eccb2a47c9ed792c9a01449c2486989c3252a3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcd0f26fa804f10c07ba6d24a8c0cadec554aee79e85f95b9b022eeb8af7ca79 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2715c789db8808052221cebb6e4ff2a2737f34bb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abb05b45100df207a73d5a81263572e10ad5439e79ffc2305cab9b7649cf264b +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c69a60c6857a0e502d73f0dc1ea2779133ddab51 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5fd8a48c472d551381317797e80386b3b3ce78c3007d88233aa7b8a56ee10b4 +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9801b876d4902a6f04c8f4fc65c072e6082867 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -4.131592681121827, + -18.96289906921387, + -16.909606227111816, + -1.205507601451874, + -2.2364452423095704, + -1.8819086204528812, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 16.65274486618042, + 37.19429024200439, + 23.655689654541014, + 1.3209557065963748, + 2.6528479496955875, + 1.1486967510223387, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 2.868856906890869, + 6.296340465545654, + 1.3196077346801758, + 0.007151931058615446, + -0.012491658329963684, + -0.12626242637634277, + 0.12140887975692749, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 4.3321146965026855, + 12.4215087890625, + 7.703039169311523, + 0.391439288854599, + 0.8076039552688599, + 0.505150318145752, + 0.9926025867462158, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.763728466033935, + -21.229162658691408, + -2.350775989151001, + -4.0587354017257695, + -3.285622364997864, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.5495108631134, + 30.41332916412354, + 14.36571702880859, + 1.8286980584144592, + 2.2455153399467473, + 1.9114159921646117, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.469674587249756, + 1.137302041053772, + -3.50521183013916, + -0.009232619777321815, + -0.7088616490364075, + -0.43785586953163147, + 0.14176446199417114, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.948984146118164, + 16.641460418701172, + 8.162801742553711, + 0.6890953779220581, + 1.1180040836334229, + 0.9564125537872314, + 0.9899004101753235, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a9eb530d797ed2467a1391766c9fbb914e971f53 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/trainer_state.json @@ -0,0 +1,9834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8388759062855773, + "eval_steps": 500, + "global_step": 14000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 5.55898904800415, + "learning_rate": 1.8e-07, + "loss": 0.7669, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.9875104427337646, + "learning_rate": 3.8e-07, + "loss": 0.7281, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 6.316451072692871, + "learning_rate": 5.800000000000001e-07, + "loss": 0.7134, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 4.037688255310059, + "learning_rate": 7.8e-07, + "loss": 0.6077, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 5.4920220375061035, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6779, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 3.809985876083374, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5578, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 5.501481533050537, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5453, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 2.584683418273926, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4145, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 2.854585886001587, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3617, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 3.2181553840637207, + "learning_rate": 1.98e-06, + "loss": 0.3402, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 1.6713179349899292, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2286, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 2.60302996635437, + "learning_rate": 2.38e-06, + "loss": 0.2477, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 1.7488818168640137, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1342, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 1.826812982559204, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.1243, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 1.1744091510772705, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1012, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 2.3573529720306396, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1108, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 2.1422371864318848, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1081, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.6756604313850403, + "learning_rate": 3.58e-06, + "loss": 0.0947, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 1.8197052478790283, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.103, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 2.135390281677246, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0791, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 1.185013771057129, + "learning_rate": 4.18e-06, + "loss": 0.0751, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 1.478454828262329, + "learning_rate": 4.38e-06, + "loss": 0.0685, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 1.1979939937591553, + "learning_rate": 4.58e-06, + "loss": 0.0642, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 1.3315266370773315, + "learning_rate": 4.78e-06, + "loss": 0.0706, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 1.219875454902649, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 1.9281997680664062, + "learning_rate": 5.18e-06, + "loss": 0.0781, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.5599610209465027, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0742, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.9128719568252563, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0638, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.5633432269096375, + "learning_rate": 5.78e-06, + "loss": 0.0633, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.7961149215698242, + "learning_rate": 5.98e-06, + "loss": 0.062, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 1.9408375024795532, + "learning_rate": 6.18e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 1.1925369501113892, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0654, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 1.0636825561523438, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0513, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.5671424865722656, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0561, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.8431388139724731, + "learning_rate": 6.98e-06, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 1.3813819885253906, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0619, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.7528055906295776, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0502, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 1.38446044921875, + "learning_rate": 7.58e-06, + "loss": 0.0623, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.9472984671592712, + "learning_rate": 7.78e-06, + "loss": 0.0471, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.640555739402771, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0539, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 1.4841065406799316, + "learning_rate": 8.18e-06, + "loss": 0.0684, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 1.0691452026367188, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.8026740550994873, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0579, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 1.3472259044647217, + "learning_rate": 8.78e-06, + "loss": 0.0725, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.8364902138710022, + "learning_rate": 8.98e-06, + "loss": 0.0613, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 1.0086181163787842, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0558, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 1.0559569597244263, + "learning_rate": 9.38e-06, + "loss": 0.0561, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.9138600826263428, + "learning_rate": 9.58e-06, + "loss": 0.0507, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.6099390387535095, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0543, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.890690803527832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.071, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.8349231481552124, + "learning_rate": 1.018e-05, + "loss": 0.0515, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 1.5466762781143188, + "learning_rate": 1.038e-05, + "loss": 0.0865, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 1.0859519243240356, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0511, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.7235454320907593, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.6314525008201599, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0494, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 1.5067164897918701, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0453, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.9329689145088196, + "learning_rate": 1.138e-05, + "loss": 0.0565, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 1.3631505966186523, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0513, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 1.2341063022613525, + "learning_rate": 1.178e-05, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.7126315832138062, + "learning_rate": 1.198e-05, + "loss": 0.0465, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.9995419383049011, + "learning_rate": 1.218e-05, + "loss": 0.0423, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.7614652514457703, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0466, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.7718682289123535, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0508, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.7280911803245544, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0481, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.6350377798080444, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0493, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.6868598461151123, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.057, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 1.132020354270935, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0464, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 1.097875952720642, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0465, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.8246905207633972, + "learning_rate": 1.378e-05, + "loss": 0.0488, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.5858931541442871, + "learning_rate": 1.398e-05, + "loss": 0.0533, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.418e-05, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.87618488073349, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0417, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.8312808871269226, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0627, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.5213949680328369, + "learning_rate": 1.478e-05, + "loss": 0.0526, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.7599508762359619, + "learning_rate": 1.498e-05, + "loss": 0.0487, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.9282987713813782, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0544, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 1.5959566831588745, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0594, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.6384497284889221, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.049, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.5377854108810425, + "learning_rate": 1.578e-05, + "loss": 0.0529, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.6186609864234924, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0485, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.9750168323516846, + "learning_rate": 1.618e-05, + "loss": 0.0458, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.6810588836669922, + "learning_rate": 1.638e-05, + "loss": 0.0521, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.8613447546958923, + "learning_rate": 1.658e-05, + "loss": 0.0464, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.8379164338111877, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0589, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.9312345385551453, + "learning_rate": 1.698e-05, + "loss": 0.0534, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.6983106732368469, + "learning_rate": 1.718e-05, + "loss": 0.0591, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.6549938321113586, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0571, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3887499272823334, + "learning_rate": 1.758e-05, + "loss": 0.0362, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 1.1392686367034912, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0602, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.834979772567749, + "learning_rate": 1.798e-05, + "loss": 0.0483, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.9094700813293457, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0536, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.9519254565238953, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0514, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.6514044404029846, + "learning_rate": 1.858e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.6005147099494934, + "learning_rate": 1.878e-05, + "loss": 0.0527, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 1.0990339517593384, + "learning_rate": 1.898e-05, + "loss": 0.0453, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.7029110193252563, + "learning_rate": 1.918e-05, + "loss": 0.0527, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.6106461882591248, + "learning_rate": 1.938e-05, + "loss": 0.043, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.48976996541023254, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0482, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 1.045139193534851, + "learning_rate": 1.978e-05, + "loss": 0.0449, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.7444337010383606, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0499, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.8378720879554749, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0606, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.5345956683158875, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.041, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.6428268551826477, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0648, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.9010246992111206, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0441, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.6655222177505493, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0532, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.5328973531723022, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0488, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 1.2394806146621704, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0525, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.9671902656555176, + "learning_rate": 1.999967041472886e-05, + "loss": 0.051, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.8754792213439941, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.054, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.524354875087738, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0682, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 1.0633796453475952, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0435, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.7348024249076843, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.923546552658081, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0501, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 1.0579051971435547, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.8214036822319031, + "learning_rate": 1.999882759038658e-05, + "loss": 0.057, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.7640904188156128, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0468, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5744732022285461, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0416, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.40397152304649353, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0389, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.6207796931266785, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0484, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 1.5230320692062378, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0586, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.8499330282211304, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0671, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.7697583436965942, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.061, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.6107252836227417, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.40468829870224, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0558, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.7711566686630249, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0487, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 1.0216137170791626, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0411, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 1.1135109663009644, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0428, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.545289158821106, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0426, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.9514102339744568, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0529, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.9448748826980591, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0468, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 1.1176340579986572, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.06, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.6428054571151733, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0398, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.8000763058662415, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0688, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.7624617218971252, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0524, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.7986068725585938, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0511, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 1.179044246673584, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0518, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.7511209845542908, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.041, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.8336644768714905, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.7198546528816223, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0472, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 1.404756784439087, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0479, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.861412525177002, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0448, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 1.2575286626815796, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0504, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.7020149230957031, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0416, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.9072129726409912, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0483, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.5503928661346436, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0498, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.5776561498641968, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.7854406237602234, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0431, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.7011817097663879, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0615, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.7760916352272034, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0525, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.9866206049919128, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0492, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.7466640472412109, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0564, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.8808642029762268, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0461, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.8980852365493774, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0613, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.6824257969856262, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0763, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.681532084941864, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0492, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.5667393207550049, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0471, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.5026432275772095, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0424, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.37448638677597046, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.037, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.6236661076545715, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.9748323559761047, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0326, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.7733910083770752, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0527, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.6466084718704224, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.6644402146339417, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0434, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 1.5936143398284912, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0495, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.5655786991119385, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0475, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.9557194709777832, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0518, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.8929481506347656, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0435, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.7515624761581421, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0404, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.7718303203582764, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0476, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.5583183765411377, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0495, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.7166038155555725, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0601, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.9311782717704773, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0507, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6159361600875854, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0319, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.816769003868103, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0505, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.9040331244468689, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.0498, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 1.696012020111084, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0689, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.5169436931610107, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0414, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 1.9156256914138794, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0558, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.6522107720375061, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0427, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.8480607867240906, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0425, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.6939795017242432, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0521, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.5763843059539795, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0486, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.6420201063156128, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0428, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.5305889248847961, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0371, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 1.3216971158981323, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0441, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.6441370844841003, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0444, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 1.4227683544158936, + "learning_rate": 1.996014938229576e-05, + "loss": 0.053, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.667000412940979, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0405, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.6865925192832947, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0532, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.8819414377212524, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0402, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.8738685250282288, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0494, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.8790421485900879, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0753, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.5451251268386841, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0385, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.46721863746643066, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0395, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.41896265745162964, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0461, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.7582527995109558, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0461, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.7154091596603394, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0464, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.788686215877533, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.46885132789611816, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0472, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.5174703598022461, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0501, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.8058022260665894, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.044, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.49327152967453003, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0404, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 1.532515048980713, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0548, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 1.1101130247116089, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0542, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.7396823763847351, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.042, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5801792740821838, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0589, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 1.4451886415481567, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0402, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.61793053150177, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0583, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.8073042631149292, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0492, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.9468027949333191, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0466, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.7384629249572754, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0589, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.4612124562263489, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.043, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.6821345090866089, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0373, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.6727206110954285, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0706, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.6935863494873047, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0376, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.9824007153511047, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0418, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.9782054424285889, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0453, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.7749345898628235, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0449, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 1.1558616161346436, + "learning_rate": 1.992544454099507e-05, + "loss": 0.051, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.33876606822013855, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0463, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.5539175271987915, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0389, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.554639995098114, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0375, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.46284249424934387, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0365, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.7209586501121521, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0465, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 1.0352572202682495, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0609, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.3893347680568695, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0449, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.3959295451641083, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.042, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.47758615016937256, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0608, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.7173318266868591, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0511, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.5889247059822083, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.5986958146095276, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.9506963491439819, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0513, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.8730902671813965, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0429, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.5152983069419861, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0347, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.786233127117157, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0464, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.7376151084899902, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0479, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.595055878162384, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0392, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.8207923769950867, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0441, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.7003177404403687, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.036, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.6637803316116333, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.5207458138465881, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0476, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 1.241939663887024, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0466, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.7212964296340942, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0459, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.6244897246360779, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.571205198764801, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0611, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.8839776515960693, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0464, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.580142080783844, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0434, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.6745111346244812, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0443, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.9726730585098267, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.48007458448410034, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0442, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.7205815315246582, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0461, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.5800597667694092, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0553, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.6497617959976196, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0398, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.7487000226974487, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.053, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.6686383485794067, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0494, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.6101617217063904, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0397, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.49039891362190247, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 1.076252818107605, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0472, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.7085466980934143, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0481, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.6343501210212708, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.7452435493469238, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0485, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.6645557880401611, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0455, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.5987662076950073, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0384, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 1.078682541847229, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0416, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.8880276083946228, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0427, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.8119439482688904, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0516, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.5018808245658875, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.035, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.623843252658844, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0468, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.48201584815979004, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0387, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.5672967433929443, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0374, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.7304291129112244, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0458, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 1.1493513584136963, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0495, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.8220258951187134, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0565, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 1.0740118026733398, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0484, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.6214267015457153, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0346, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6255515813827515, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 1.0625102519989014, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0511, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.8623147010803223, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.043, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.92961186170578, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0428, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.6050530076026917, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0405, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.944632351398468, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0434, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.4904105067253113, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0423, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.7352654337882996, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0425, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 1.0492011308670044, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0616, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.7823440432548523, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0447, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.8018720149993896, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0371, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.49853745102882385, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.036, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.8805229663848877, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0524, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.5573164820671082, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0387, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.7481330633163452, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0466, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.40816730260849, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0651, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.6791403889656067, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0393, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.7291558384895325, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0521, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.6312416791915894, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0489, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.7327824831008911, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0343, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 1.3112396001815796, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 1.2425460815429688, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0419, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.6839079856872559, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0491, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.7781338691711426, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0434, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.5329035520553589, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0468, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.7196246981620789, + "learning_rate": 1.978769450291435e-05, + "loss": 0.044, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.7625473737716675, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0441, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.5458085536956787, + "learning_rate": 1.978346349055984e-05, + "loss": 0.039, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.7765107154846191, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0467, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.7010345458984375, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.04, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.626748263835907, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0373, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.5149411559104919, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0461, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.9740221500396729, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.037, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.504397988319397, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.054, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.5483772158622742, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0365, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.29313552379608154, + "learning_rate": 1.976612732743278e-05, + "loss": 0.038, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.8453809022903442, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0413, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.5152369141578674, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0383, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.9969985485076904, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0465, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.9506912231445312, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0377, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.9154256582260132, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0428, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 1.2283018827438354, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0403, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.6880149841308594, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0395, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.4900283217430115, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0368, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.7604786157608032, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0447, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.559420108795166, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0456, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5867525339126587, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.4810929596424103, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0406, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.8294567465782166, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0405, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.8964418172836304, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0551, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5311513543128967, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.048, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.806564450263977, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0422, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.6752825975418091, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0436, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.5873673558235168, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.046, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.44951826333999634, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.6930672526359558, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0482, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5176821351051331, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0469, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.49050986766815186, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0505, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.7312544584274292, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0397, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.7582018375396729, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0472, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.5867499113082886, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0402, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.5435264706611633, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0357, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.7370457053184509, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.774713933467865, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0419, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 1.3614526987075806, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0443, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.6087996959686279, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0362, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.6685174703598022, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0437, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.9508783221244812, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0403, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.5553990006446838, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0454, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.5054144263267517, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0651, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.42293739318847656, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0431, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.7212286591529846, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0415, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.473127543926239, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.6872493028640747, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.031, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.5251455903053284, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0391, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.5380337834358215, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0409, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.7052116394042969, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0416, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.8229309916496277, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0372, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.9506240487098694, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0419, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.6417449116706848, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.6112877130508423, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0498, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 1.0621747970581055, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0478, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.7538444995880127, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0402, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.5625021457672119, + "learning_rate": 1.964833301001045e-05, + "loss": 0.048, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.47914358973503113, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0371, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.6854084134101868, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0478, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.9252145886421204, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0368, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.8439743518829346, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0417, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 1.0050065517425537, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0444, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.7451267242431641, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0444, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.8371824622154236, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0413, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 1.0461528301239014, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0343, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.39973369240760803, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0411, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.4291725754737854, + "learning_rate": 1.962083815106258e-05, + "loss": 0.035, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.7072318196296692, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.5897591710090637, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0422, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.724743664264679, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0412, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.6499989628791809, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0456, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.7375554442405701, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0481, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.5231707096099854, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0444, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.6235650777816772, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0352, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.43499720096588135, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0389, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.797736406326294, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0444, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 1.0550916194915771, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0504, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.6214169263839722, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0406, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.698083221912384, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0593, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.6379665732383728, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0493, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.5507146120071411, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0433, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.5956857204437256, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.44772031903266907, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0479, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.9360495209693909, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0434, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.5642439126968384, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0396, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.4046037495136261, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0408, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.5948778986930847, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0349, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.8199960589408875, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.035, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.4827987253665924, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0422, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.8324541449546814, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0396, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.4008340537548065, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0399, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.6216022372245789, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0456, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.37505266070365906, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0385, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.49176743626594543, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0394, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.5399725437164307, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0438, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.8310949802398682, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 1.1955338716506958, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0459, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 1.0068060159683228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0491, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.5460902452468872, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.7850955128669739, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.038, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.36727651953697205, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.042, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.5334084630012512, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0472, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.7271261215209961, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0382, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.5323888063430786, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0436, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.45585381984710693, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0374, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.7871994376182556, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0523, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.5605924129486084, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0394, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.6938880085945129, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0394, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.5804795026779175, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0437, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 1.0168874263763428, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0419, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.6860261559486389, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0381, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.7029629349708557, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.5081820487976074, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0359, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4721413254737854, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0445, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.36132606863975525, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0443, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.6331628561019897, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.5754039287567139, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0364, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 1.5680726766586304, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0568, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.49352893233299255, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0352, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.6292720437049866, + "learning_rate": 1.945830755977688e-05, + "loss": 0.056, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.7185224294662476, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.5580431222915649, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0395, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.7590157985687256, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0367, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.6500505208969116, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0373, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.408975213766098, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0458, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.5616204142570496, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.6361889243125916, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0371, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.8486977219581604, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0428, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.7492835521697998, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0444, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.7901867032051086, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0413, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.6845218539237976, + "learning_rate": 1.942106227801521e-05, + "loss": 0.041, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.9644033908843994, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0482, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.45466694235801697, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.37155815958976746, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0563, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.4936427175998688, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0466, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.6540364027023315, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0426, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.38369905948638916, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.5450782179832458, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0499, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.24151510000228882, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0431, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.8069043159484863, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0447, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.5423257946968079, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0496, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.4058588445186615, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0402, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.6126188635826111, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0458, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.7490487694740295, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0493, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.7295238971710205, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0462, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.7178632616996765, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0443, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.7040836215019226, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0414, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.6338651776313782, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0354, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 1.3360642194747925, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0503, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.46927154064178467, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.7340303659439087, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0381, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5492366552352905, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0328, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.7509336471557617, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0368, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.4471103847026825, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0405, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.6582043170928955, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0422, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.6933317184448242, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0347, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.450021892786026, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.5376274585723877, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0619, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.722744882106781, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0446, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.6075776219367981, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.047, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.6950559020042419, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0366, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.5763269066810608, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0416, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.5462995767593384, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.042, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.6304270029067993, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0388, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.6828057765960693, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0324, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.37152284383773804, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0451, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.4172256588935852, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0357, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.5640333294868469, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0522, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.8016167879104614, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0381, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.591262698173523, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0382, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.5212893486022949, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0478, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.7837402820587158, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0443, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.9257993698120117, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0468, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.5952717065811157, + "learning_rate": 1.926404507646751e-05, + "loss": 0.033, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.9675727486610413, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0451, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5518060326576233, + "learning_rate": 1.925630281527157e-05, + "loss": 0.039, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.9742224216461182, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0398, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.6197847723960876, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0466, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.47963154315948486, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0449, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.41337478160858154, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0441, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.7238340973854065, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0438, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.9248948097229004, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.059, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.6670559048652649, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0388, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.956350564956665, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0402, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.6378766894340515, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0377, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.9037134647369385, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.046, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.7720431685447693, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0519, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.7988153100013733, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0437, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.6672379970550537, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.8264118432998657, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0463, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.6753244400024414, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.048, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.5530163645744324, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0552, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 1.4215611219406128, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0537, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.8495141267776489, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0431, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.5609806180000305, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0355, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.30011680722236633, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0503, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.5155858993530273, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0402, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.48371294140815735, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0476, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.49065709114074707, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.4877799451351166, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0337, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.5917441248893738, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0379, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.42583322525024414, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.045, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.6343463659286499, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0449, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.8575727343559265, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0453, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.7644649147987366, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0396, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.6534778475761414, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0354, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.3632607161998749, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.035, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.9180629849433899, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.48914220929145813, + "learning_rate": 1.912298771234382e-05, + "loss": 0.043, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.8579902052879333, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0467, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 1.523177146911621, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 1.2650493383407593, + "learning_rate": 1.911035077753307e-05, + "loss": 0.046, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.8262631893157959, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0345, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.8710194826126099, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.8287770748138428, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.7243760824203491, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0445, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5953600406646729, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0409, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.5678296685218811, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0405, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.764759361743927, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0399, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.5969082713127136, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0345, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.5686851739883423, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0415, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.7042335867881775, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0343, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.46049684286117554, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0367, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.521037757396698, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0493, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.6116137504577637, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0341, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.6932541728019714, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.038, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.6795322299003601, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0555, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 1.5589205026626587, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0498, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.58689945936203, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0432, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.7746279239654541, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0455, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4707143008708954, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0365, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.6717873811721802, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0441, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 1.1001774072647095, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0387, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.6617273092269897, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.045, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 1.0732862949371338, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0461, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.43623387813568115, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0387, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.5842541456222534, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0401, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.8832051753997803, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0434, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.8454849123954773, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0364, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4587421119213104, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0342, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.5914700627326965, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0381, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.5075448751449585, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0614, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.6165316700935364, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0355, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.8761339783668518, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0382, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.8730667233467102, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0486, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.4631735384464264, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0479, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.7657212615013123, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0359, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.49685898423194885, + "learning_rate": 1.894749443411004e-05, + "loss": 0.037, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.8567603230476379, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0415, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.8778802156448364, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0427, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.7849876284599304, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.041, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.49304109811782837, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0406, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.6490961909294128, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0457, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 1.1704363822937012, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0489, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.6955687403678894, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0426, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.9385222792625427, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0397, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 1.0259507894515991, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0406, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 1.5581048727035522, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0377, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 1.1154224872589111, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0352, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.8913238048553467, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0372, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.32929253578186035, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0302, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.7686375379562378, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.7077587246894836, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0404, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.7370178699493408, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0379, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.8013477325439453, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0391, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.9743591547012329, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0466, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.6816489100456238, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0509, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.7814317345619202, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0449, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.6797910332679749, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.041, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.7159250378608704, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0408, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.7630175352096558, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0403, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.7929314374923706, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0468, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.5765302181243896, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0382, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.5043740272521973, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0447, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.7895818948745728, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0381, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.8037170767784119, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0434, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 1.0758732557296753, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0369, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.6673927307128906, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0475, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.6661775708198547, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0478, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.6422731280326843, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.6632615923881531, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0377, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.5715954899787903, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0306, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3375200629234314, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0385, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.42938506603240967, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0359, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.453436940908432, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0498, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.763883113861084, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.037, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.9350517392158508, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0524, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.6795313358306885, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0336, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4761887788772583, + "learning_rate": 1.875213208215953e-05, + "loss": 0.04, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.6547576189041138, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0359, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.7119831442832947, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0382, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.5195598602294922, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0577, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.44893282651901245, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.034, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.5159012079238892, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0374, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.6474353075027466, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0275, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.5070436000823975, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0382, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.28868627548217773, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0442, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.3915226459503174, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.6271824836730957, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0395, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 1.2117619514465332, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0409, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.4455721378326416, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0399, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0445, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.32646581530570984, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0435, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.4477322995662689, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0383, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.6562448740005493, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0317, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.25427868962287903, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0326, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.6234788298606873, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0328, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.4264411926269531, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0379, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.5537038445472717, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0383, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.5042442679405212, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0339, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.4152010679244995, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0324, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.6834092736244202, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0364, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.6276392340660095, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0336, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.687937319278717, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0415, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.48481765389442444, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0376, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 1.1335153579711914, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0421, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.6853719353675842, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.043, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.97500079870224, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0334, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.2953243553638458, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0334, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.6563237309455872, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0349, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.4983973205089569, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0441, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.42969775199890137, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0319, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.8316324353218079, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0359, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.4386466443538666, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0371, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.5664681792259216, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0359, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.5660601854324341, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.6432987451553345, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0447, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.6026568412780762, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0382, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.5358585119247437, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0366, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.3575671315193176, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0394, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.6645073890686035, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0391, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.6527594923973083, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0334, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.5664045810699463, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0426, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.3317505419254303, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0366, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.7218614220619202, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0399, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.6683867573738098, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0385, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.6589217185974121, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0445, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.39663317799568176, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0515, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.9468401074409485, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 1.0980640649795532, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0431, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 1.4567275047302246, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0467, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.3785778284072876, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0437, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.8112056255340576, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0406, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.8885411024093628, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0452, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.3356691002845764, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.033, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.7636258602142334, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.039, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.5050523281097412, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0331, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3761812150478363, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0346, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.560323178768158, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0417, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.5850566625595093, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0366, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.4377721846103668, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0315, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.5460193157196045, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0465, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.3818223476409912, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0313, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.566722571849823, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.037, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.970040500164032, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0354, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4968736171722412, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0376, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.5235893130302429, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0383, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.853208065032959, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0384, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.4627811312675476, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0615, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.4883791208267212, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0307, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.4702740013599396, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0539, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.5020611882209778, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0378, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.706611692905426, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0309, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.6137747764587402, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0364, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.45299193263053894, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0359, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.31410297751426697, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0425, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.48510870337486267, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.04, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.4697261154651642, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0401, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.8231471180915833, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0346, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.9511741995811462, + "learning_rate": 1.832162565208597e-05, + "loss": 0.038, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.4473752975463867, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0421, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.5309840440750122, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0375, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 1.1700010299682617, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0424, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.5007262229919434, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0389, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.8835527300834656, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.6059357523918152, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.37744027376174927, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0391, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.5641717910766602, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0383, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.4394749104976654, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0394, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.7094572186470032, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0384, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.6306723952293396, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0347, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.4480315148830414, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0415, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 1.014607310295105, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0426, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.7599517107009888, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0433, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 1.0942739248275757, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0378, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.47618037462234497, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0312, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6470023393630981, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0382, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.6031871438026428, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0336, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.7470970749855042, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0318, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.46166181564331055, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0361, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5585920214653015, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0443, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.5172198414802551, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0396, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.4908123314380646, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0294, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.5269665122032166, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0343, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.747257649898529, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0395, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.6794129610061646, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0471, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.4291394054889679, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0388, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.8051080107688904, + "learning_rate": 1.815952390818299e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.557299792766571, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0384, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.37832972407341003, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0333, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.30844688415527344, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.033, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.3014371395111084, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0344, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.778361439704895, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0351, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 1.14492666721344, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0462, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.35099321603775024, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0371, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.8470032215118408, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0339, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.641718327999115, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0363, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.6668172478675842, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0383, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.9396918416023254, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0401, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.5773718953132629, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0356, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.6474881172180176, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0487, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.5183063745498657, + "learning_rate": 1.807599344877606e-05, + "loss": 0.037, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.7699562311172485, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0487, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.6379490494728088, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0407, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4757876396179199, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0307, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.47382786870002747, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0367, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.6868136525154114, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0311, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.5475189089775085, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0293, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 1.013775110244751, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0383, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.46351560950279236, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0404, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.4883617162704468, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0408, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.6282979249954224, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0428, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 1.0833567380905151, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0394, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0405, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.7581565380096436, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0534, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.7900646328926086, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0432, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.6033529043197632, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0438, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.924926221370697, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0347, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.8485580682754517, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0523, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.3205278217792511, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0334, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.5392606854438782, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.03, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.6815987229347229, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0385, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.9605218768119812, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0359, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.5565723776817322, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0391, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.7528144717216492, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0431, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.5746167898178101, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0346, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.5058369636535645, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0346, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 1.1387027502059937, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0372, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.819324254989624, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0374, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.45600345730781555, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0344, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.7428935766220093, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0373, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.6960753202438354, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0387, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.6637990474700928, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0404, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.5612137317657471, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0375, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.6323001384735107, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0379, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.35169267654418945, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0371, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.38252803683280945, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0457, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.38694459199905396, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0345, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.37036198377609253, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0292, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.8060199618339539, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0398, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.44252580404281616, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0373, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.5565180778503418, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0345, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.4460795521736145, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0404, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.7309815883636475, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0364, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.6990997195243835, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0561, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.4198327660560608, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0401, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.04, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.48884230852127075, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0334, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.6440362930297852, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0451, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.9092825055122375, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0398, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.4839508533477783, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.8128801584243774, + "learning_rate": 1.776452218695584e-05, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.5291397571563721, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0394, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.6852243542671204, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0418, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.6294205188751221, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0374, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.5221384763717651, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0321, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.398296982049942, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0349, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.43008267879486084, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.6012991070747375, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0411, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.45076051354408264, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.037, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.6742259860038757, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0357, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.5989789962768555, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.037, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.4041040241718292, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0325, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.4937855899333954, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0354, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.5446217656135559, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0374, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.7479701638221741, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0415, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.7822495102882385, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0341, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.3672648072242737, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.035, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.5219965577125549, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0443, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.4092100262641907, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0331, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.5316944122314453, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0406, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 1.072263240814209, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0521, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.7448581457138062, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0362, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.44557711482048035, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0326, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.4298631250858307, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0365, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.45413365960121155, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0351, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.9562819004058838, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0394, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.9481335878372192, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0381, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.5020818114280701, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0402, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.6412234902381897, + "learning_rate": 1.758137056131309e-05, + "loss": 0.037, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.5511493682861328, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0535, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.5222594141960144, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0401, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.447127103805542, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0383, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.4780801832675934, + "learning_rate": 1.754802282200567e-05, + "loss": 0.041, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.2962804138660431, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0422, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.5125643014907837, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0337, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.4288216829299927, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0374, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4114690124988556, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0296, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.3511301577091217, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0315, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.8624657392501831, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0369, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.5518651008605957, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0364, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.5404661297798157, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0294, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.7494591474533081, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0315, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9748606085777283, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0429, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.8071768879890442, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0321, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.5210712552070618, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0355, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.6077958941459656, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0426, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.8688217997550964, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0366, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.7064969539642334, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0465, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0365, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.6350638270378113, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0419, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.42818939685821533, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0412, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.6915261745452881, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0327, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.9861057996749878, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.034, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.6910699009895325, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0463, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.6368144750595093, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0399, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 1.1909242868423462, + "learning_rate": 1.739216409306913e-05, + "loss": 0.042, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.6449970006942749, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0388, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.531061053276062, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0389, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.8275352716445923, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0503, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.8468548655509949, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0336, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.2949988842010498, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0342, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.30603477358818054, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 0.7177753448486328, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0381, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.4893733859062195, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0319, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.6618909239768982, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0317, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.5965152382850647, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0293, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.4357168674468994, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0478, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.9539002776145935, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0444, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.7171940207481384, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.037, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.5711817741394043, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.034, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.4134632647037506, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.39306095242500305, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0351, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.318985253572464, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0425, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.7674827575683594, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.041, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.7754977941513062, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0436, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.5827674269676208, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0371, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.3957774341106415, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0401, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.47415387630462646, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0344, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.6292631030082703, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.5913583636283875, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0385, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.465749055147171, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0402, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.7115443348884583, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0372, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.7476089596748352, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.042, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.5902891159057617, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0319, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.7117035984992981, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0312, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.7726907730102539, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0381, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.7318345308303833, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0464, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.8139578104019165, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0334, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.6128831505775452, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0338, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.478384405374527, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0361, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.36900776624679565, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0473, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 1.031351923942566, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0417, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.5248333215713501, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0402, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.6325647830963135, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.047, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.8417870402336121, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0406, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.617125391960144, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0385, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.4480224847793579, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0391, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 1.0203324556350708, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0379, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.6231842637062073, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0318, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37685611844062805, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0304, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 1.0700500011444092, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0362, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.4233555495738983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0341, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.7783017158508301, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0331, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.718287467956543, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0385, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.5477543473243713, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0308, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.5601311326026917, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0384, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.4944303631782532, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.5038384199142456, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0382, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.7288672924041748, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0319, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 1.0376721620559692, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0374, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.8827543258666992, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0351, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4307865798473358, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0321, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.5480561256408691, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0532, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.9598987102508545, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0365, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.4162677228450775, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0274, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.8729338049888611, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0437, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.7729384899139404, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0386, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.6997544169425964, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0303, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.49331608414649963, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0333, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.6684675812721252, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0329, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.5638986825942993, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.035, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.8375849723815918, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0431, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.5796175599098206, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0298, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.5302409529685974, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.032, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.43450990319252014, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0415, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.3897189795970917, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0372, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.8202592134475708, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0329, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.8023095726966858, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.3732883930206299, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0326, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.4916521906852722, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.031, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.46110638976097107, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.037, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.8587718605995178, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0351, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.7067242860794067, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.036, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.732545793056488, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.036, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.6573438048362732, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0392, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.6036579608917236, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0383, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.5556638836860657, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0396, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.7848073244094849, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0333, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.5758033394813538, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0315, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.5620765686035156, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0277, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.38210418820381165, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0437, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.6145310997962952, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0368, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.7370103001594543, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0349, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.942118763923645, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0399, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.5294848680496216, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0364, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.5716073513031006, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0313, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.4549729526042938, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0423, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.5841232538223267, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0369, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.3302208483219147, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.032, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.7107377648353577, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0382, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.6884296536445618, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0324, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.6279621720314026, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0314, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.882046103477478, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0408, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.8980706334114075, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0436, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.6433938145637512, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0395, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.6394492983818054, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.041, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.8700910806655884, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0333, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.6309515237808228, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0341, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.7955977916717529, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.8543604016304016, + "learning_rate": 1.663934987558109e-05, + "loss": 0.042, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.6915370225906372, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0347, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.6430726647377014, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0395, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.3080710768699646, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0299, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0407, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.7147136330604553, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0524, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.603560209274292, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.032, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.4913748502731323, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0419, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.532796323299408, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0463, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.7834717631340027, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0318, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.4865007698535919, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0329, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.5567988753318787, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0331, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.7487075328826904, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0408, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0294, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.7240496277809143, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0334, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.44733667373657227, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0378, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.7610008716583252, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0398, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 1.0738579034805298, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0461, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.5492804050445557, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0367, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.7817861437797546, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0392, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.6080313324928284, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0288, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.8218061923980713, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0335, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.6597305536270142, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0398, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.6254639625549316, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0339, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 1.0747283697128296, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0386, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.4679741859436035, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0409, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.7349653244018555, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0355, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.47712597250938416, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0524, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.8520345091819763, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0361, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.6470016837120056, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0296, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.8512763381004333, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0329, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.5876182913780212, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0381, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.47419166564941406, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0348, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.391215056180954, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0366, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.5373614430427551, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0373, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.23266319930553436, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0283, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.8146935105323792, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0377, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.5002696514129639, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0296, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.7518969774246216, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0394, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.44596755504608154, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0359, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.37095823884010315, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.031, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.48388785123825073, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0323, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.4681354761123657, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0573, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.9335370063781738, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0397, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.8231816291809082, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0307, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.7194622755050659, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0435, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.468923419713974, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.5806415677070618, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0422, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.6381694078445435, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0325, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.6025328636169434, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0321, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.7287771701812744, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0432, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.7109095454216003, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0315, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.4904409348964691, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0317, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.7382795214653015, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0296, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 1.2814927101135254, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.4594469368457794, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0297, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.5907943844795227, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.623093843460083, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0314, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.5146417021751404, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0362, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.5858095288276672, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0339, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.4178197383880615, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0445, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.37311851978302, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0321, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.6305625438690186, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0376, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.5927552580833435, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0339, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.4024806022644043, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.5766516327857971, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0325, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.4729812443256378, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0476, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.4650471806526184, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0387, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.6432391405105591, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.6335821151733398, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0307, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.5947774052619934, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0374, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.7248526811599731, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0286, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.5646173357963562, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0426, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.4240330457687378, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0261, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.6439619064331055, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0325, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.5899927020072937, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0328, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.6412765383720398, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.027, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.28143197298049927, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0285, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.2767931818962097, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0312, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.47175201773643494, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0318, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.4454171359539032, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0357, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.4573518931865692, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0319, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.5321150422096252, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0423, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.27531248331069946, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0284, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.663298487663269, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0328, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.9017484188079834, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0328, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0445, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.4777899980545044, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0348, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.5475958585739136, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0418, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.524467408657074, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0301, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.6302708387374878, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0334, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.41625329852104187, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0353, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2699313759803772, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0387, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.701999306678772, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.6053565144538879, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0343, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.864326000213623, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0371, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.7532107830047607, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0323, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.5603524446487427, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0357, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.5668624639511108, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0421, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.6352995038032532, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0381, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.7873902320861816, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0293, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.5853860378265381, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0336, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.525260329246521, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0404, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.4027518033981323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0334, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.9426722526550293, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0397, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.6003656983375549, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0408, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.643667459487915, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0507, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.6342907547950745, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0338, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.4388107657432556, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0393, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.3304736614227295, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0371, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.6479781866073608, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0357, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.5461524128913879, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0367, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.4362160563468933, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0302, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.5188114643096924, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0322, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.34805068373680115, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0355, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.5073755383491516, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0446, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.5647034645080566, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0386, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.5983169078826904, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.4163302481174469, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0278, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.5769792199134827, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.33103784918785095, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0272, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.6019038558006287, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0286, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.8199634552001953, + "learning_rate": 1.56658563993822e-05, + "loss": 0.041, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.7426667213439941, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0327, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.3630203306674957, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0316, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.7804543972015381, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0369, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.43314239382743835, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0362, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.5570499897003174, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0307, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.5796618461608887, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0312, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.7355082035064697, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0357, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.39807555079460144, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0281, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.7723329663276672, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0314, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.3936077058315277, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0344, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.6881195902824402, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0343, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.5343065857887268, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0336, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.6643530130386353, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.032, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.5642407536506653, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0326, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.6929567456245422, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0351, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.33013442158699036, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0362, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 1.056101679801941, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0443, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.5164589881896973, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0446, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.319035142660141, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0367, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.8530817627906799, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0321, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.7768056392669678, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0318, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.4015219211578369, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0263, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.6409371495246887, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0371, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.5829829573631287, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0424, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.8098331093788147, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0318, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.40581029653549194, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0345, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.5018268823623657, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0338, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.3689005970954895, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0304, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.4961407482624054, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0349, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.5551972389221191, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0389, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.5989762544631958, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0308, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.33431145548820496, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0291, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.5390793085098267, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0409, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.6348057389259338, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0299, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.9015149474143982, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0372, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.4148661494255066, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0351, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.48212167620658875, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0369, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.6210904121398926, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0387, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.4606397747993469, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0325, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.597671627998352, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0264, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.39612457156181335, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0291, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.514916718006134, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0327, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.3551333248615265, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0306, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.3721555173397064, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0343, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3669307231903076, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0339, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.5142899751663208, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0388, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.7722563147544861, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0319, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.5405625104904175, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.025, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.6617732048034668, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0361, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.8938334584236145, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0326, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.7913880944252014, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0325, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.6919751763343811, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0353, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.6518043279647827, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0292, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.8302627801895142, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0292, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.6278629302978516, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0314, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.42736759781837463, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0313, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 1.0469647645950317, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.038, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.4306422173976898, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.692587673664093, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.034, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.8272542953491211, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0332, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.700703501701355, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0435, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22474133968353271, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0348, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.47771376371383667, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0365, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.5043072700500488, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0336, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.4886966347694397, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0291, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.3845444321632385, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0418, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.6324570775032043, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0357, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.5614244937896729, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0351, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.4815816879272461, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0401, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.7729785442352295, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0357, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.589121401309967, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0319, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.5420895218849182, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0346, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.4504237771034241, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0279, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.26984909176826477, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.6075000762939453, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0319, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.6065084338188171, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0383, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.573225736618042, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0424, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.8821173906326294, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0409, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.4947790205478668, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0472, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.748337984085083, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0384, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.6375566124916077, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0373, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.6218035221099854, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0343, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.4296681880950928, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0317, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3609360158443451, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0348, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.49597665667533875, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.034, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.4339931309223175, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0351, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.44051092863082886, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0391, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.41610655188560486, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0345, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.6215106844902039, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0439, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.6418285965919495, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0289, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.6148926019668579, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0396, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.8690620064735413, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0371, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.4794996678829193, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.7622746229171753, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0396, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 1.0384955406188965, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0352, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.33424243330955505, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0272, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.5626234412193298, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0267, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.31714314222335815, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0297, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.8281066417694092, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0337, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.6054716110229492, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0336, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.5764144659042358, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0296, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.4696876108646393, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0318, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.5324695110321045, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0294, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.2989593744277954, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0275, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.6373855471611023, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0334, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.5332064032554626, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0333, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.4900652766227722, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.6812027096748352, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0321, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.6765509843826294, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0329, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.5016193389892578, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.034, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.5259473919868469, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0341, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.4551076292991638, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0289, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.5946309566497803, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0367, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.8045580387115479, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0292, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 1.089473843574524, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0433, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.7314861416816711, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0344, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.3244793713092804, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0329, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.9454575181007385, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.041, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.4321480393409729, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0338, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.7338399887084961, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0317, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.5811594724655151, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0299, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 1.1259782314300537, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0402, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.4460951089859009, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0279, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.4996945858001709, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0331, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.6428117156028748, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0339, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.7815113663673401, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0333, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.46364331245422363, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0321, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.6084109544754028, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0347, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.5775942206382751, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.4764224886894226, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0326, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.49608105421066284, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.033, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.40599140524864197, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0323, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.44920462369918823, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0348, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.393081396818161, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0329, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.5393109917640686, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0332, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.49641427397727966, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0341, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.4762181341648102, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0293, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.7498350143432617, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0338, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.5212231874465942, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0336, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3803718388080597, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0336, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.3723069429397583, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0313, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.6411343216896057, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0298, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.7487270832061768, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0334, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.4146348237991333, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0362, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.6354920864105225, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0345, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.8422425985336304, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.6452838182449341, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0317, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.6057304739952087, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0349, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.4880058467388153, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0283, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.6094764471054077, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0424, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.552979588508606, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0318, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.5134180188179016, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0267, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.3264164626598358, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0347, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.6406404972076416, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0326, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.4818336069583893, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0357, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.4660695791244507, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0348, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.527518093585968, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0293, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.5105645656585693, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0299, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.5807327628135681, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0348, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.34552720189094543, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0281, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.6902264952659607, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0345, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.7842390537261963, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0392, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.37371599674224854, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0307, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.4447094798088074, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0343, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.5179654359817505, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0328, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.34313148260116577, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0327, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.5038807988166809, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0398, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.5751231908798218, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0365, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.23205915093421936, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0338, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.3348182141780853, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0264, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.432725727558136, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0377, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.5504162907600403, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0334, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.7994229793548584, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0369, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.7374292016029358, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0305, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.786674976348877, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0283, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.39285191893577576, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.028, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.49710261821746826, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0285, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.2925172448158264, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0353, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.5930903553962708, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0265, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.5205737352371216, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0349, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.5042659044265747, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0376, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.6537132263183594, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0402, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.5453435182571411, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0344, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.7153663635253906, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0365, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.4821360409259796, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0359, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.389950156211853, + "learning_rate": 1.403120543105273e-05, + "loss": 0.031, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.6750137805938721, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0353, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.5380377173423767, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0329, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.45814576745033264, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0312, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.6910536289215088, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0349, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.49182868003845215, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0377, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.41329771280288696, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0383, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.47242429852485657, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0313, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.45115360617637634, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0294, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.44364428520202637, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0328, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.4205247461795807, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0282, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 1.0961225032806396, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0274, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.6065059304237366, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0327, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.3095875084400177, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0348, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.8527400493621826, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0285, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.4449825882911682, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0435, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 1.1708461046218872, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0312, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.6145966053009033, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0283, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.5100684762001038, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0331, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.37704023718833923, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0327, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.6774486899375916, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0347, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.4984931945800781, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0303, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.6189061403274536, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0316, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4665672183036804, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.038, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.898800790309906, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0292, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.5205129384994507, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0322, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.588542640209198, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0307, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.620620846748352, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.035, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.639234185218811, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0296, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.38672956824302673, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0355, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.5244165062904358, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0305, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.8960945010185242, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0323, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3789278566837311, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.031, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.5104514956474304, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0405, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.5860878825187683, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0376, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.9913963079452515, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0386, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.4112319350242615, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0276, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.703815221786499, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0303, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.7342479825019836, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0303, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.46025165915489197, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0324, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.3976695239543915, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0255, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.4137699604034424, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0298, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.6333696842193604, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0438, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.5179958343505859, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0268, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.5947912335395813, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0266, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.7916423678398132, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0363, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.7686305046081543, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0338, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.5727254152297974, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0275, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.8913756012916565, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0365, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.45855259895324707, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0401, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.8214496374130249, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0371, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.5001949667930603, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.033, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.6546716094017029, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0422, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.35789239406585693, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0323, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.7539666891098022, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0316, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.422543466091156, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0388, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.5595449805259705, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0351, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.3847978115081787, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0285, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.4276559352874756, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0292, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.5125867128372192, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0351, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.7208243012428284, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0293, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.5181360244750977, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0316, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.3499206304550171, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0281, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.26258599758148193, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.027, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.7002774477005005, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.031, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.5419202446937561, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0384, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.3112017512321472, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0234, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.6459445357322693, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0302, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.5128807425498962, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0385, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.41403454542160034, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0321, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.4647153615951538, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0358, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.29951611161231995, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0288, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.3440749943256378, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0274, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.413753867149353, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0276, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.29087361693382263, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.03, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.7001593708992004, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0277, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.47245970368385315, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0426, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.5747501850128174, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0337, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.42420580983161926, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0407, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.2931080162525177, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0344, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.8410253524780273, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0385, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.27601751685142517, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0304, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5673372745513916, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0261, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.5385505557060242, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0296, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.4159039556980133, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0343, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 1.0409079790115356, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0325, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.5017931461334229, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0311, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.45170727372169495, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0302, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.7260886430740356, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0353, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.7251535058021545, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0329, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.21863135695457458, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0354, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.5168152451515198, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0268, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.509765088558197, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0321, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.4227997958660126, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.031, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.5740527510643005, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0351, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.5497387647628784, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0277, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.3965212106704712, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.028, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.43198928236961365, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0421, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.42254316806793213, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0335, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.3395012617111206, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0309, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.6258816719055176, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0287, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.7914189100265503, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0263, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.4104739725589752, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0282, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.47704172134399414, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0358, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.7908433675765991, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0341, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.7039026021957397, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0369, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.4095489978790283, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.047, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.6500707864761353, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0285, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3794250190258026, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0293, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.3065261244773865, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.031, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.3773103654384613, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0303, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.602186918258667, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0398, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.5309048891067505, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0251, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.9474682211875916, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0345, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.7786683440208435, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0289, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.6320096850395203, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0326, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.7034086585044861, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0332, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.5060988664627075, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0337, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.7484520673751831, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0317, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.6556681394577026, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0349, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.41952699422836304, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0318, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.4678110182285309, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0328, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.35579657554626465, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0346, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.5984554290771484, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0277, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.41169118881225586, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0288, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.5163332223892212, + "learning_rate": 1.285944160290905e-05, + "loss": 0.027, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.780305802822113, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0249, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.4293205142021179, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0302, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.650065004825592, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0349, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.3155161142349243, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0333, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.5841111540794373, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0371, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.3873291015625, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0304, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.39657002687454224, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0279, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.6305680871009827, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0293, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.5810249447822571, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0317, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.6288999319076538, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0283, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.5402754545211792, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0258, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 1.3184820413589478, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0398, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.9564218521118164, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0301, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.8810652494430542, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0376, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.4254887104034424, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0336, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.45076319575309753, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0266, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.6057546138763428, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0292, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.4007343649864197, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0352, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.4183088541030884, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0265, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.368300199508667, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0326, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.4838104844093323, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0262, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.5136057138442993, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0299, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.5161435604095459, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0339, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.6350359320640564, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0361, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.5247905254364014, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0259, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.5668240785598755, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0324, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.48688119649887085, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0395, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.8496071100234985, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0326, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.7072296142578125, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0307, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.7262448072433472, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0376, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.5265096426010132, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0331, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.7246168851852417, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0286, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.4539868235588074, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.036, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.36881664395332336, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0302, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.37113773822784424, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0278, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.537762463092804, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0325, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.6519997715950012, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0309, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.31448549032211304, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0245, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.43815988302230835, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0398, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.525791585445404, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0261, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.4887944757938385, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.025, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.5287007689476013, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0278, + "step": 14000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.74494465933312e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5fecc60b61aa66699566b01045633ce2fd4a6a74 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-14000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad96fcc5212b0fb64af2ed9b5a1ad33dee0cea6a86c08271b39c38f4388a38a +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dc3a28b95911ed93d90dfaa81f1813f8badf2edd --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e63bcd0c108e2d897b58726aa165c6b4ef303c43a6f7fb2e2f2b60bd0e382b8 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fc1c06aced30f8e8c8e29353cb9b5a9d7db50104 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9f00e7ca9b9c0acf22ce49c09e30e73fedbbedd4351c7ead3e24f914d6daf85 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..98d153ef5668b37d4f6b043b164bce62c0450fd3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4661f48285cf25500fc7aab0ff4675fc180b11422262d83d1874f1941f56a4e +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9801b876d4902a6f04c8f4fc65c072e6082867 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -4.131592681121827, + -18.96289906921387, + -16.909606227111816, + -1.205507601451874, + -2.2364452423095704, + -1.8819086204528812, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 16.65274486618042, + 37.19429024200439, + 23.655689654541014, + 1.3209557065963748, + 2.6528479496955875, + 1.1486967510223387, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 2.868856906890869, + 6.296340465545654, + 1.3196077346801758, + 0.007151931058615446, + -0.012491658329963684, + -0.12626242637634277, + 0.12140887975692749, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 4.3321146965026855, + 12.4215087890625, + 7.703039169311523, + 0.391439288854599, + 0.8076039552688599, + 0.505150318145752, + 0.9926025867462158, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.763728466033935, + -21.229162658691408, + -2.350775989151001, + -4.0587354017257695, + -3.285622364997864, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.5495108631134, + 30.41332916412354, + 14.36571702880859, + 1.8286980584144592, + 2.2455153399467473, + 1.9114159921646117, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.469674587249756, + 1.137302041053772, + -3.50521183013916, + -0.009232619777321815, + -0.7088616490364075, + -0.43785586953163147, + 0.14176446199417114, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.948984146118164, + 16.641460418701172, + 8.162801742553711, + 0.6890953779220581, + 1.1180040836334229, + 0.9564125537872314, + 0.9899004101753235, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b6a3eb30e0ff162f2dd1b64a8b8282ad9ffbcccc --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/trainer_state.json @@ -0,0 +1,11234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9587153214692312, + "eval_steps": 500, + "global_step": 16000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 5.55898904800415, + "learning_rate": 1.8e-07, + "loss": 0.7669, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.9875104427337646, + "learning_rate": 3.8e-07, + "loss": 0.7281, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 6.316451072692871, + "learning_rate": 5.800000000000001e-07, + "loss": 0.7134, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 4.037688255310059, + "learning_rate": 7.8e-07, + "loss": 0.6077, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 5.4920220375061035, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6779, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 3.809985876083374, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5578, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 5.501481533050537, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5453, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 2.584683418273926, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4145, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 2.854585886001587, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3617, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 3.2181553840637207, + "learning_rate": 1.98e-06, + "loss": 0.3402, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 1.6713179349899292, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2286, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 2.60302996635437, + "learning_rate": 2.38e-06, + "loss": 0.2477, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 1.7488818168640137, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1342, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 1.826812982559204, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.1243, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 1.1744091510772705, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1012, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 2.3573529720306396, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1108, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 2.1422371864318848, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1081, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.6756604313850403, + "learning_rate": 3.58e-06, + "loss": 0.0947, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 1.8197052478790283, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.103, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 2.135390281677246, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0791, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 1.185013771057129, + "learning_rate": 4.18e-06, + "loss": 0.0751, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 1.478454828262329, + "learning_rate": 4.38e-06, + "loss": 0.0685, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 1.1979939937591553, + "learning_rate": 4.58e-06, + "loss": 0.0642, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 1.3315266370773315, + "learning_rate": 4.78e-06, + "loss": 0.0706, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 1.219875454902649, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 1.9281997680664062, + "learning_rate": 5.18e-06, + "loss": 0.0781, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.5599610209465027, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0742, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.9128719568252563, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0638, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.5633432269096375, + "learning_rate": 5.78e-06, + "loss": 0.0633, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.7961149215698242, + "learning_rate": 5.98e-06, + "loss": 0.062, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 1.9408375024795532, + "learning_rate": 6.18e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 1.1925369501113892, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0654, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 1.0636825561523438, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0513, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.5671424865722656, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0561, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.8431388139724731, + "learning_rate": 6.98e-06, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 1.3813819885253906, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0619, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.7528055906295776, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0502, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 1.38446044921875, + "learning_rate": 7.58e-06, + "loss": 0.0623, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.9472984671592712, + "learning_rate": 7.78e-06, + "loss": 0.0471, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.640555739402771, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0539, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 1.4841065406799316, + "learning_rate": 8.18e-06, + "loss": 0.0684, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 1.0691452026367188, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.8026740550994873, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0579, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 1.3472259044647217, + "learning_rate": 8.78e-06, + "loss": 0.0725, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.8364902138710022, + "learning_rate": 8.98e-06, + "loss": 0.0613, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 1.0086181163787842, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0558, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 1.0559569597244263, + "learning_rate": 9.38e-06, + "loss": 0.0561, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.9138600826263428, + "learning_rate": 9.58e-06, + "loss": 0.0507, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.6099390387535095, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0543, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.890690803527832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.071, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.8349231481552124, + "learning_rate": 1.018e-05, + "loss": 0.0515, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 1.5466762781143188, + "learning_rate": 1.038e-05, + "loss": 0.0865, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 1.0859519243240356, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0511, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.7235454320907593, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.6314525008201599, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0494, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 1.5067164897918701, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0453, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.9329689145088196, + "learning_rate": 1.138e-05, + "loss": 0.0565, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 1.3631505966186523, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0513, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 1.2341063022613525, + "learning_rate": 1.178e-05, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.7126315832138062, + "learning_rate": 1.198e-05, + "loss": 0.0465, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.9995419383049011, + "learning_rate": 1.218e-05, + "loss": 0.0423, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.7614652514457703, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0466, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.7718682289123535, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0508, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.7280911803245544, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0481, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.6350377798080444, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0493, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.6868598461151123, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.057, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 1.132020354270935, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0464, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 1.097875952720642, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0465, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.8246905207633972, + "learning_rate": 1.378e-05, + "loss": 0.0488, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.5858931541442871, + "learning_rate": 1.398e-05, + "loss": 0.0533, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.418e-05, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.87618488073349, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0417, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.8312808871269226, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0627, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.5213949680328369, + "learning_rate": 1.478e-05, + "loss": 0.0526, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.7599508762359619, + "learning_rate": 1.498e-05, + "loss": 0.0487, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.9282987713813782, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0544, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 1.5959566831588745, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0594, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.6384497284889221, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.049, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.5377854108810425, + "learning_rate": 1.578e-05, + "loss": 0.0529, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.6186609864234924, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0485, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.9750168323516846, + "learning_rate": 1.618e-05, + "loss": 0.0458, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.6810588836669922, + "learning_rate": 1.638e-05, + "loss": 0.0521, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.8613447546958923, + "learning_rate": 1.658e-05, + "loss": 0.0464, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.8379164338111877, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0589, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.9312345385551453, + "learning_rate": 1.698e-05, + "loss": 0.0534, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.6983106732368469, + "learning_rate": 1.718e-05, + "loss": 0.0591, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.6549938321113586, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0571, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3887499272823334, + "learning_rate": 1.758e-05, + "loss": 0.0362, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 1.1392686367034912, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0602, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.834979772567749, + "learning_rate": 1.798e-05, + "loss": 0.0483, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.9094700813293457, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0536, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.9519254565238953, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0514, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.6514044404029846, + "learning_rate": 1.858e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.6005147099494934, + "learning_rate": 1.878e-05, + "loss": 0.0527, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 1.0990339517593384, + "learning_rate": 1.898e-05, + "loss": 0.0453, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.7029110193252563, + "learning_rate": 1.918e-05, + "loss": 0.0527, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.6106461882591248, + "learning_rate": 1.938e-05, + "loss": 0.043, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.48976996541023254, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0482, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 1.045139193534851, + "learning_rate": 1.978e-05, + "loss": 0.0449, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.7444337010383606, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0499, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.8378720879554749, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0606, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.5345956683158875, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.041, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.6428268551826477, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0648, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.9010246992111206, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0441, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.6655222177505493, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0532, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.5328973531723022, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0488, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 1.2394806146621704, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0525, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.9671902656555176, + "learning_rate": 1.999967041472886e-05, + "loss": 0.051, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.8754792213439941, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.054, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.524354875087738, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0682, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 1.0633796453475952, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0435, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.7348024249076843, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.923546552658081, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0501, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 1.0579051971435547, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.8214036822319031, + "learning_rate": 1.999882759038658e-05, + "loss": 0.057, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.7640904188156128, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0468, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5744732022285461, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0416, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.40397152304649353, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0389, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.6207796931266785, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0484, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 1.5230320692062378, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0586, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.8499330282211304, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0671, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.7697583436965942, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.061, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.6107252836227417, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.40468829870224, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0558, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.7711566686630249, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0487, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 1.0216137170791626, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0411, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 1.1135109663009644, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0428, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.545289158821106, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0426, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.9514102339744568, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0529, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.9448748826980591, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0468, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 1.1176340579986572, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.06, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.6428054571151733, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0398, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.8000763058662415, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0688, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.7624617218971252, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0524, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.7986068725585938, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0511, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 1.179044246673584, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0518, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.7511209845542908, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.041, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.8336644768714905, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.7198546528816223, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0472, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 1.404756784439087, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0479, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.861412525177002, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0448, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 1.2575286626815796, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0504, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.7020149230957031, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0416, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.9072129726409912, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0483, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.5503928661346436, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0498, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.5776561498641968, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.7854406237602234, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0431, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.7011817097663879, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0615, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.7760916352272034, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0525, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.9866206049919128, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0492, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.7466640472412109, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0564, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.8808642029762268, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0461, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.8980852365493774, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0613, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.6824257969856262, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0763, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.681532084941864, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0492, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.5667393207550049, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0471, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.5026432275772095, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0424, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.37448638677597046, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.037, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.6236661076545715, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.9748323559761047, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0326, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.7733910083770752, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0527, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.6466084718704224, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.6644402146339417, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0434, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 1.5936143398284912, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0495, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.5655786991119385, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0475, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.9557194709777832, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0518, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.8929481506347656, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0435, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.7515624761581421, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0404, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.7718303203582764, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0476, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.5583183765411377, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0495, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.7166038155555725, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0601, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.9311782717704773, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0507, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6159361600875854, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0319, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.816769003868103, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0505, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.9040331244468689, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.0498, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 1.696012020111084, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0689, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.5169436931610107, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0414, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 1.9156256914138794, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0558, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.6522107720375061, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0427, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.8480607867240906, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0425, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.6939795017242432, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0521, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.5763843059539795, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0486, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.6420201063156128, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0428, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.5305889248847961, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0371, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 1.3216971158981323, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0441, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.6441370844841003, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0444, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 1.4227683544158936, + "learning_rate": 1.996014938229576e-05, + "loss": 0.053, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.667000412940979, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0405, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.6865925192832947, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0532, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.8819414377212524, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0402, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.8738685250282288, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0494, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.8790421485900879, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0753, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.5451251268386841, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0385, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.46721863746643066, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0395, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.41896265745162964, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0461, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.7582527995109558, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0461, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.7154091596603394, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0464, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.788686215877533, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.46885132789611816, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0472, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.5174703598022461, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0501, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.8058022260665894, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.044, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.49327152967453003, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0404, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 1.532515048980713, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0548, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 1.1101130247116089, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0542, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.7396823763847351, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.042, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5801792740821838, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0589, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 1.4451886415481567, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0402, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.61793053150177, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0583, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.8073042631149292, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0492, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.9468027949333191, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0466, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.7384629249572754, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0589, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.4612124562263489, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.043, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.6821345090866089, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0373, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.6727206110954285, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0706, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.6935863494873047, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0376, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.9824007153511047, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0418, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.9782054424285889, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0453, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.7749345898628235, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0449, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 1.1558616161346436, + "learning_rate": 1.992544454099507e-05, + "loss": 0.051, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.33876606822013855, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0463, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.5539175271987915, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0389, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.554639995098114, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0375, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.46284249424934387, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0365, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.7209586501121521, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0465, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 1.0352572202682495, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0609, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.3893347680568695, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0449, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.3959295451641083, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.042, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.47758615016937256, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0608, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.7173318266868591, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0511, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.5889247059822083, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.5986958146095276, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.9506963491439819, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0513, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.8730902671813965, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0429, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.5152983069419861, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0347, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.786233127117157, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0464, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.7376151084899902, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0479, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.595055878162384, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0392, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.8207923769950867, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0441, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.7003177404403687, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.036, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.6637803316116333, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.5207458138465881, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0476, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 1.241939663887024, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0466, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.7212964296340942, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0459, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.6244897246360779, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.571205198764801, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0611, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.8839776515960693, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0464, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.580142080783844, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0434, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.6745111346244812, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0443, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.9726730585098267, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.48007458448410034, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0442, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.7205815315246582, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0461, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.5800597667694092, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0553, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.6497617959976196, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0398, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.7487000226974487, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.053, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.6686383485794067, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0494, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.6101617217063904, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0397, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.49039891362190247, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 1.076252818107605, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0472, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.7085466980934143, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0481, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.6343501210212708, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.7452435493469238, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0485, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.6645557880401611, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0455, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.5987662076950073, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0384, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 1.078682541847229, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0416, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.8880276083946228, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0427, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.8119439482688904, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0516, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.5018808245658875, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.035, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.623843252658844, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0468, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.48201584815979004, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0387, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.5672967433929443, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0374, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.7304291129112244, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0458, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 1.1493513584136963, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0495, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.8220258951187134, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0565, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 1.0740118026733398, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0484, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.6214267015457153, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0346, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6255515813827515, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 1.0625102519989014, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0511, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.8623147010803223, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.043, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.92961186170578, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0428, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.6050530076026917, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0405, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.944632351398468, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0434, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.4904105067253113, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0423, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.7352654337882996, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0425, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 1.0492011308670044, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0616, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.7823440432548523, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0447, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.8018720149993896, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0371, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.49853745102882385, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.036, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.8805229663848877, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0524, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.5573164820671082, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0387, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.7481330633163452, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0466, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.40816730260849, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0651, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.6791403889656067, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0393, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.7291558384895325, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0521, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.6312416791915894, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0489, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.7327824831008911, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0343, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 1.3112396001815796, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 1.2425460815429688, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0419, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.6839079856872559, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0491, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.7781338691711426, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0434, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.5329035520553589, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0468, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.7196246981620789, + "learning_rate": 1.978769450291435e-05, + "loss": 0.044, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.7625473737716675, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0441, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.5458085536956787, + "learning_rate": 1.978346349055984e-05, + "loss": 0.039, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.7765107154846191, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0467, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.7010345458984375, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.04, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.626748263835907, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0373, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.5149411559104919, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0461, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.9740221500396729, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.037, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.504397988319397, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.054, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.5483772158622742, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0365, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.29313552379608154, + "learning_rate": 1.976612732743278e-05, + "loss": 0.038, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.8453809022903442, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0413, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.5152369141578674, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0383, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.9969985485076904, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0465, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.9506912231445312, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0377, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.9154256582260132, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0428, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 1.2283018827438354, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0403, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.6880149841308594, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0395, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.4900283217430115, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0368, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.7604786157608032, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0447, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.559420108795166, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0456, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5867525339126587, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.4810929596424103, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0406, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.8294567465782166, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0405, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.8964418172836304, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0551, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5311513543128967, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.048, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.806564450263977, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0422, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.6752825975418091, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0436, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.5873673558235168, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.046, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.44951826333999634, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.6930672526359558, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0482, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5176821351051331, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0469, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.49050986766815186, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0505, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.7312544584274292, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0397, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.7582018375396729, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0472, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.5867499113082886, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0402, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.5435264706611633, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0357, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.7370457053184509, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.774713933467865, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0419, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 1.3614526987075806, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0443, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.6087996959686279, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0362, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.6685174703598022, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0437, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.9508783221244812, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0403, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.5553990006446838, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0454, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.5054144263267517, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0651, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.42293739318847656, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0431, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.7212286591529846, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0415, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.473127543926239, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.6872493028640747, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.031, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.5251455903053284, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0391, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.5380337834358215, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0409, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.7052116394042969, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0416, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.8229309916496277, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0372, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.9506240487098694, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0419, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.6417449116706848, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.6112877130508423, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0498, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 1.0621747970581055, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0478, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.7538444995880127, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0402, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.5625021457672119, + "learning_rate": 1.964833301001045e-05, + "loss": 0.048, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.47914358973503113, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0371, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.6854084134101868, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0478, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.9252145886421204, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0368, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.8439743518829346, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0417, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 1.0050065517425537, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0444, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.7451267242431641, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0444, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.8371824622154236, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0413, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 1.0461528301239014, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0343, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.39973369240760803, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0411, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.4291725754737854, + "learning_rate": 1.962083815106258e-05, + "loss": 0.035, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.7072318196296692, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.5897591710090637, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0422, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.724743664264679, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0412, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.6499989628791809, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0456, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.7375554442405701, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0481, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.5231707096099854, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0444, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.6235650777816772, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0352, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.43499720096588135, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0389, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.797736406326294, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0444, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 1.0550916194915771, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0504, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.6214169263839722, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0406, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.698083221912384, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0593, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.6379665732383728, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0493, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.5507146120071411, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0433, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.5956857204437256, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.44772031903266907, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0479, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.9360495209693909, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0434, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.5642439126968384, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0396, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.4046037495136261, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0408, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.5948778986930847, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0349, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.8199960589408875, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.035, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.4827987253665924, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0422, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.8324541449546814, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0396, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.4008340537548065, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0399, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.6216022372245789, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0456, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.37505266070365906, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0385, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.49176743626594543, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0394, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.5399725437164307, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0438, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.8310949802398682, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 1.1955338716506958, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0459, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 1.0068060159683228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0491, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.5460902452468872, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.7850955128669739, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.038, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.36727651953697205, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.042, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.5334084630012512, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0472, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.7271261215209961, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0382, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.5323888063430786, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0436, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.45585381984710693, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0374, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.7871994376182556, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0523, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.5605924129486084, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0394, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.6938880085945129, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0394, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.5804795026779175, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0437, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 1.0168874263763428, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0419, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.6860261559486389, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0381, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.7029629349708557, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.5081820487976074, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0359, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4721413254737854, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0445, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.36132606863975525, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0443, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.6331628561019897, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.5754039287567139, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0364, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 1.5680726766586304, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0568, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.49352893233299255, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0352, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.6292720437049866, + "learning_rate": 1.945830755977688e-05, + "loss": 0.056, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.7185224294662476, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.5580431222915649, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0395, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.7590157985687256, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0367, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.6500505208969116, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0373, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.408975213766098, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0458, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.5616204142570496, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.6361889243125916, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0371, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.8486977219581604, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0428, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.7492835521697998, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0444, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.7901867032051086, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0413, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.6845218539237976, + "learning_rate": 1.942106227801521e-05, + "loss": 0.041, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.9644033908843994, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0482, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.45466694235801697, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.37155815958976746, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0563, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.4936427175998688, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0466, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.6540364027023315, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0426, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.38369905948638916, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.5450782179832458, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0499, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.24151510000228882, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0431, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.8069043159484863, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0447, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.5423257946968079, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0496, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.4058588445186615, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0402, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.6126188635826111, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0458, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.7490487694740295, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0493, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.7295238971710205, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0462, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.7178632616996765, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0443, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.7040836215019226, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0414, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.6338651776313782, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0354, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 1.3360642194747925, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0503, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.46927154064178467, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.7340303659439087, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0381, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5492366552352905, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0328, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.7509336471557617, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0368, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.4471103847026825, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0405, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.6582043170928955, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0422, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.6933317184448242, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0347, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.450021892786026, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.5376274585723877, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0619, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.722744882106781, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0446, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.6075776219367981, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.047, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.6950559020042419, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0366, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.5763269066810608, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0416, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.5462995767593384, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.042, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.6304270029067993, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0388, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.6828057765960693, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0324, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.37152284383773804, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0451, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.4172256588935852, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0357, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.5640333294868469, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0522, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.8016167879104614, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0381, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.591262698173523, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0382, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.5212893486022949, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0478, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.7837402820587158, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0443, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.9257993698120117, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0468, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.5952717065811157, + "learning_rate": 1.926404507646751e-05, + "loss": 0.033, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.9675727486610413, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0451, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5518060326576233, + "learning_rate": 1.925630281527157e-05, + "loss": 0.039, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.9742224216461182, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0398, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.6197847723960876, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0466, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.47963154315948486, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0449, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.41337478160858154, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0441, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.7238340973854065, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0438, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.9248948097229004, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.059, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.6670559048652649, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0388, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.956350564956665, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0402, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.6378766894340515, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0377, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.9037134647369385, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.046, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.7720431685447693, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0519, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.7988153100013733, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0437, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.6672379970550537, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.8264118432998657, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0463, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.6753244400024414, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.048, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.5530163645744324, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0552, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 1.4215611219406128, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0537, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.8495141267776489, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0431, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.5609806180000305, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0355, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.30011680722236633, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0503, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.5155858993530273, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0402, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.48371294140815735, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0476, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.49065709114074707, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.4877799451351166, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0337, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.5917441248893738, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0379, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.42583322525024414, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.045, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.6343463659286499, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0449, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.8575727343559265, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0453, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.7644649147987366, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0396, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.6534778475761414, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0354, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.3632607161998749, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.035, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.9180629849433899, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.48914220929145813, + "learning_rate": 1.912298771234382e-05, + "loss": 0.043, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.8579902052879333, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0467, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 1.523177146911621, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 1.2650493383407593, + "learning_rate": 1.911035077753307e-05, + "loss": 0.046, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.8262631893157959, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0345, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.8710194826126099, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.8287770748138428, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.7243760824203491, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0445, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5953600406646729, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0409, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.5678296685218811, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0405, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.764759361743927, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0399, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.5969082713127136, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0345, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.5686851739883423, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0415, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.7042335867881775, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0343, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.46049684286117554, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0367, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.521037757396698, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0493, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.6116137504577637, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0341, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.6932541728019714, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.038, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.6795322299003601, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0555, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 1.5589205026626587, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0498, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.58689945936203, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0432, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.7746279239654541, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0455, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4707143008708954, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0365, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.6717873811721802, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0441, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 1.1001774072647095, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0387, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.6617273092269897, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.045, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 1.0732862949371338, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0461, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.43623387813568115, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0387, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.5842541456222534, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0401, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.8832051753997803, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0434, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.8454849123954773, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0364, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4587421119213104, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0342, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.5914700627326965, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0381, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.5075448751449585, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0614, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.6165316700935364, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0355, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.8761339783668518, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0382, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.8730667233467102, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0486, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.4631735384464264, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0479, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.7657212615013123, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0359, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.49685898423194885, + "learning_rate": 1.894749443411004e-05, + "loss": 0.037, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.8567603230476379, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0415, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.8778802156448364, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0427, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.7849876284599304, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.041, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.49304109811782837, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0406, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.6490961909294128, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0457, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 1.1704363822937012, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0489, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.6955687403678894, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0426, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.9385222792625427, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0397, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 1.0259507894515991, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0406, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 1.5581048727035522, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0377, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 1.1154224872589111, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0352, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.8913238048553467, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0372, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.32929253578186035, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0302, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.7686375379562378, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.7077587246894836, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0404, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.7370178699493408, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0379, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.8013477325439453, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0391, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.9743591547012329, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0466, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.6816489100456238, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0509, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.7814317345619202, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0449, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.6797910332679749, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.041, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.7159250378608704, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0408, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.7630175352096558, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0403, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.7929314374923706, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0468, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.5765302181243896, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0382, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.5043740272521973, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0447, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.7895818948745728, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0381, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.8037170767784119, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0434, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 1.0758732557296753, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0369, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.6673927307128906, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0475, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.6661775708198547, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0478, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.6422731280326843, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.6632615923881531, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0377, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.5715954899787903, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0306, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3375200629234314, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0385, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.42938506603240967, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0359, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.453436940908432, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0498, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.763883113861084, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.037, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.9350517392158508, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0524, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.6795313358306885, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0336, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4761887788772583, + "learning_rate": 1.875213208215953e-05, + "loss": 0.04, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.6547576189041138, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0359, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.7119831442832947, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0382, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.5195598602294922, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0577, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.44893282651901245, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.034, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.5159012079238892, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0374, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.6474353075027466, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0275, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.5070436000823975, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0382, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.28868627548217773, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0442, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.3915226459503174, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.6271824836730957, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0395, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 1.2117619514465332, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0409, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.4455721378326416, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0399, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0445, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.32646581530570984, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0435, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.4477322995662689, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0383, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.6562448740005493, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0317, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.25427868962287903, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0326, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.6234788298606873, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0328, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.4264411926269531, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0379, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.5537038445472717, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0383, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.5042442679405212, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0339, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.4152010679244995, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0324, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.6834092736244202, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0364, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.6276392340660095, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0336, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.687937319278717, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0415, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.48481765389442444, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0376, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 1.1335153579711914, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0421, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.6853719353675842, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.043, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.97500079870224, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0334, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.2953243553638458, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0334, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.6563237309455872, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0349, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.4983973205089569, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0441, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.42969775199890137, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0319, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.8316324353218079, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0359, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.4386466443538666, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0371, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.5664681792259216, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0359, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.5660601854324341, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.6432987451553345, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0447, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.6026568412780762, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0382, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.5358585119247437, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0366, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.3575671315193176, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0394, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.6645073890686035, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0391, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.6527594923973083, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0334, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.5664045810699463, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0426, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.3317505419254303, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0366, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.7218614220619202, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0399, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.6683867573738098, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0385, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.6589217185974121, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0445, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.39663317799568176, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0515, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.9468401074409485, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 1.0980640649795532, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0431, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 1.4567275047302246, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0467, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.3785778284072876, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0437, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.8112056255340576, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0406, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.8885411024093628, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0452, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.3356691002845764, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.033, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.7636258602142334, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.039, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.5050523281097412, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0331, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3761812150478363, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0346, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.560323178768158, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0417, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.5850566625595093, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0366, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.4377721846103668, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0315, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.5460193157196045, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0465, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.3818223476409912, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0313, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.566722571849823, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.037, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.970040500164032, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0354, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4968736171722412, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0376, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.5235893130302429, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0383, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.853208065032959, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0384, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.4627811312675476, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0615, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.4883791208267212, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0307, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.4702740013599396, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0539, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.5020611882209778, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0378, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.706611692905426, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0309, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.6137747764587402, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0364, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.45299193263053894, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0359, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.31410297751426697, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0425, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.48510870337486267, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.04, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.4697261154651642, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0401, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.8231471180915833, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0346, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.9511741995811462, + "learning_rate": 1.832162565208597e-05, + "loss": 0.038, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.4473752975463867, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0421, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.5309840440750122, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0375, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 1.1700010299682617, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0424, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.5007262229919434, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0389, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.8835527300834656, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.6059357523918152, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.37744027376174927, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0391, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.5641717910766602, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0383, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.4394749104976654, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0394, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.7094572186470032, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0384, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.6306723952293396, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0347, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.4480315148830414, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0415, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 1.014607310295105, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0426, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.7599517107009888, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0433, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 1.0942739248275757, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0378, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.47618037462234497, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0312, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6470023393630981, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0382, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.6031871438026428, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0336, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.7470970749855042, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0318, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.46166181564331055, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0361, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5585920214653015, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0443, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.5172198414802551, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0396, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.4908123314380646, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0294, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.5269665122032166, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0343, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.747257649898529, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0395, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.6794129610061646, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0471, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.4291394054889679, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0388, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.8051080107688904, + "learning_rate": 1.815952390818299e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.557299792766571, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0384, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.37832972407341003, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0333, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.30844688415527344, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.033, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.3014371395111084, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0344, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.778361439704895, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0351, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 1.14492666721344, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0462, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.35099321603775024, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0371, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.8470032215118408, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0339, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.641718327999115, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0363, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.6668172478675842, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0383, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.9396918416023254, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0401, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.5773718953132629, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0356, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.6474881172180176, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0487, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.5183063745498657, + "learning_rate": 1.807599344877606e-05, + "loss": 0.037, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.7699562311172485, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0487, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.6379490494728088, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0407, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4757876396179199, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0307, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.47382786870002747, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0367, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.6868136525154114, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0311, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.5475189089775085, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0293, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 1.013775110244751, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0383, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.46351560950279236, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0404, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.4883617162704468, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0408, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.6282979249954224, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0428, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 1.0833567380905151, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0394, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0405, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.7581565380096436, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0534, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.7900646328926086, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0432, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.6033529043197632, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0438, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.924926221370697, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0347, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.8485580682754517, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0523, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.3205278217792511, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0334, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.5392606854438782, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.03, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.6815987229347229, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0385, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.9605218768119812, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0359, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.5565723776817322, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0391, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.7528144717216492, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0431, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.5746167898178101, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0346, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.5058369636535645, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0346, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 1.1387027502059937, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0372, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.819324254989624, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0374, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.45600345730781555, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0344, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.7428935766220093, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0373, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.6960753202438354, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0387, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.6637990474700928, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0404, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.5612137317657471, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0375, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.6323001384735107, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0379, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.35169267654418945, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0371, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.38252803683280945, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0457, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.38694459199905396, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0345, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.37036198377609253, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0292, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.8060199618339539, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0398, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.44252580404281616, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0373, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.5565180778503418, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0345, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.4460795521736145, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0404, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.7309815883636475, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0364, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.6990997195243835, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0561, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.4198327660560608, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0401, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.04, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.48884230852127075, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0334, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.6440362930297852, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0451, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.9092825055122375, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0398, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.4839508533477783, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.8128801584243774, + "learning_rate": 1.776452218695584e-05, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.5291397571563721, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0394, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.6852243542671204, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0418, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.6294205188751221, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0374, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.5221384763717651, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0321, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.398296982049942, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0349, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.43008267879486084, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.6012991070747375, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0411, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.45076051354408264, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.037, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.6742259860038757, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0357, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.5989789962768555, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.037, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.4041040241718292, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0325, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.4937855899333954, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0354, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.5446217656135559, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0374, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.7479701638221741, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0415, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.7822495102882385, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0341, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.3672648072242737, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.035, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.5219965577125549, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0443, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.4092100262641907, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0331, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.5316944122314453, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0406, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 1.072263240814209, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0521, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.7448581457138062, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0362, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.44557711482048035, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0326, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.4298631250858307, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0365, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.45413365960121155, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0351, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.9562819004058838, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0394, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.9481335878372192, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0381, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.5020818114280701, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0402, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.6412234902381897, + "learning_rate": 1.758137056131309e-05, + "loss": 0.037, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.5511493682861328, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0535, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.5222594141960144, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0401, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.447127103805542, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0383, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.4780801832675934, + "learning_rate": 1.754802282200567e-05, + "loss": 0.041, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.2962804138660431, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0422, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.5125643014907837, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0337, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.4288216829299927, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0374, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4114690124988556, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0296, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.3511301577091217, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0315, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.8624657392501831, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0369, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.5518651008605957, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0364, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.5404661297798157, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0294, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.7494591474533081, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0315, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9748606085777283, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0429, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.8071768879890442, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0321, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.5210712552070618, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0355, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.6077958941459656, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0426, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.8688217997550964, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0366, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.7064969539642334, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0465, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0365, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.6350638270378113, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0419, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.42818939685821533, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0412, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.6915261745452881, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0327, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.9861057996749878, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.034, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.6910699009895325, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0463, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.6368144750595093, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0399, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 1.1909242868423462, + "learning_rate": 1.739216409306913e-05, + "loss": 0.042, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.6449970006942749, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0388, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.531061053276062, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0389, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.8275352716445923, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0503, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.8468548655509949, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0336, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.2949988842010498, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0342, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.30603477358818054, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 0.7177753448486328, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0381, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.4893733859062195, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0319, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.6618909239768982, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0317, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.5965152382850647, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0293, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.4357168674468994, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0478, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.9539002776145935, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0444, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.7171940207481384, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.037, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.5711817741394043, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.034, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.4134632647037506, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.39306095242500305, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0351, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.318985253572464, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0425, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.7674827575683594, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.041, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.7754977941513062, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0436, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.5827674269676208, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0371, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.3957774341106415, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0401, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.47415387630462646, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0344, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.6292631030082703, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.5913583636283875, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0385, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.465749055147171, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0402, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.7115443348884583, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0372, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.7476089596748352, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.042, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.5902891159057617, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0319, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.7117035984992981, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0312, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.7726907730102539, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0381, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.7318345308303833, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0464, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.8139578104019165, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0334, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.6128831505775452, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0338, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.478384405374527, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0361, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.36900776624679565, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0473, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 1.031351923942566, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0417, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.5248333215713501, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0402, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.6325647830963135, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.047, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.8417870402336121, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0406, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.617125391960144, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0385, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.4480224847793579, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0391, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 1.0203324556350708, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0379, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.6231842637062073, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0318, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37685611844062805, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0304, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 1.0700500011444092, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0362, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.4233555495738983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0341, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.7783017158508301, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0331, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.718287467956543, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0385, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.5477543473243713, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0308, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.5601311326026917, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0384, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.4944303631782532, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.5038384199142456, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0382, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.7288672924041748, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0319, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 1.0376721620559692, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0374, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.8827543258666992, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0351, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4307865798473358, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0321, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.5480561256408691, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0532, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.9598987102508545, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0365, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.4162677228450775, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0274, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.8729338049888611, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0437, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.7729384899139404, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0386, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.6997544169425964, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0303, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.49331608414649963, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0333, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.6684675812721252, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0329, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.5638986825942993, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.035, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.8375849723815918, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0431, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.5796175599098206, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0298, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.5302409529685974, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.032, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.43450990319252014, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0415, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.3897189795970917, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0372, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.8202592134475708, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0329, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.8023095726966858, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.3732883930206299, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0326, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.4916521906852722, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.031, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.46110638976097107, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.037, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.8587718605995178, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0351, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.7067242860794067, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.036, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.732545793056488, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.036, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.6573438048362732, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0392, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.6036579608917236, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0383, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.5556638836860657, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0396, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.7848073244094849, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0333, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.5758033394813538, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0315, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.5620765686035156, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0277, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.38210418820381165, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0437, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.6145310997962952, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0368, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.7370103001594543, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0349, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.942118763923645, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0399, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.5294848680496216, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0364, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.5716073513031006, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0313, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.4549729526042938, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0423, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.5841232538223267, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0369, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.3302208483219147, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.032, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.7107377648353577, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0382, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.6884296536445618, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0324, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.6279621720314026, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0314, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.882046103477478, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0408, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.8980706334114075, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0436, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.6433938145637512, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0395, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.6394492983818054, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.041, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.8700910806655884, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0333, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.6309515237808228, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0341, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.7955977916717529, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.8543604016304016, + "learning_rate": 1.663934987558109e-05, + "loss": 0.042, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.6915370225906372, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0347, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.6430726647377014, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0395, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.3080710768699646, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0299, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0407, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.7147136330604553, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0524, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.603560209274292, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.032, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.4913748502731323, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0419, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.532796323299408, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0463, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.7834717631340027, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0318, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.4865007698535919, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0329, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.5567988753318787, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0331, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.7487075328826904, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0408, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0294, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.7240496277809143, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0334, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.44733667373657227, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0378, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.7610008716583252, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0398, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 1.0738579034805298, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0461, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.5492804050445557, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0367, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.7817861437797546, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0392, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.6080313324928284, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0288, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.8218061923980713, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0335, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.6597305536270142, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0398, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.6254639625549316, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0339, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 1.0747283697128296, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0386, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.4679741859436035, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0409, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.7349653244018555, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0355, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.47712597250938416, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0524, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.8520345091819763, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0361, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.6470016837120056, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0296, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.8512763381004333, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0329, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.5876182913780212, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0381, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.47419166564941406, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0348, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.391215056180954, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0366, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.5373614430427551, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0373, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.23266319930553436, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0283, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.8146935105323792, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0377, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.5002696514129639, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0296, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.7518969774246216, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0394, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.44596755504608154, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0359, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.37095823884010315, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.031, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.48388785123825073, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0323, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.4681354761123657, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0573, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.9335370063781738, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0397, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.8231816291809082, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0307, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.7194622755050659, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0435, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.468923419713974, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.5806415677070618, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0422, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.6381694078445435, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0325, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.6025328636169434, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0321, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.7287771701812744, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0432, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.7109095454216003, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0315, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.4904409348964691, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0317, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.7382795214653015, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0296, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 1.2814927101135254, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.4594469368457794, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0297, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.5907943844795227, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.623093843460083, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0314, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.5146417021751404, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0362, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.5858095288276672, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0339, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.4178197383880615, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0445, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.37311851978302, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0321, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.6305625438690186, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0376, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.5927552580833435, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0339, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.4024806022644043, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.5766516327857971, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0325, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.4729812443256378, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0476, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.4650471806526184, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0387, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.6432391405105591, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.6335821151733398, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0307, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.5947774052619934, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0374, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.7248526811599731, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0286, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.5646173357963562, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0426, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.4240330457687378, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0261, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.6439619064331055, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0325, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.5899927020072937, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0328, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.6412765383720398, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.027, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.28143197298049927, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0285, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.2767931818962097, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0312, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.47175201773643494, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0318, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.4454171359539032, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0357, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.4573518931865692, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0319, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.5321150422096252, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0423, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.27531248331069946, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0284, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.663298487663269, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0328, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.9017484188079834, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0328, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0445, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.4777899980545044, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0348, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.5475958585739136, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0418, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.524467408657074, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0301, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.6302708387374878, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0334, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.41625329852104187, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0353, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2699313759803772, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0387, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.701999306678772, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.6053565144538879, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0343, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.864326000213623, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0371, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.7532107830047607, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0323, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.5603524446487427, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0357, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.5668624639511108, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0421, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.6352995038032532, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0381, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.7873902320861816, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0293, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.5853860378265381, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0336, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.525260329246521, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0404, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.4027518033981323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0334, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.9426722526550293, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0397, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.6003656983375549, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0408, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.643667459487915, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0507, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.6342907547950745, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0338, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.4388107657432556, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0393, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.3304736614227295, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0371, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.6479781866073608, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0357, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.5461524128913879, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0367, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.4362160563468933, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0302, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.5188114643096924, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0322, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.34805068373680115, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0355, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.5073755383491516, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0446, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.5647034645080566, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0386, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.5983169078826904, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.4163302481174469, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0278, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.5769792199134827, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.33103784918785095, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0272, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.6019038558006287, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0286, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.8199634552001953, + "learning_rate": 1.56658563993822e-05, + "loss": 0.041, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.7426667213439941, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0327, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.3630203306674957, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0316, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.7804543972015381, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0369, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.43314239382743835, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0362, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.5570499897003174, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0307, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.5796618461608887, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0312, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.7355082035064697, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0357, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.39807555079460144, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0281, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.7723329663276672, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0314, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.3936077058315277, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0344, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.6881195902824402, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0343, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.5343065857887268, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0336, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.6643530130386353, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.032, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.5642407536506653, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0326, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.6929567456245422, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0351, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.33013442158699036, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0362, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 1.056101679801941, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0443, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.5164589881896973, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0446, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.319035142660141, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0367, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.8530817627906799, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0321, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.7768056392669678, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0318, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.4015219211578369, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0263, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.6409371495246887, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0371, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.5829829573631287, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0424, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.8098331093788147, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0318, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.40581029653549194, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0345, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.5018268823623657, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0338, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.3689005970954895, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0304, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.4961407482624054, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0349, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.5551972389221191, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0389, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.5989762544631958, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0308, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.33431145548820496, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0291, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.5390793085098267, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0409, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.6348057389259338, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0299, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.9015149474143982, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0372, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.4148661494255066, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0351, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.48212167620658875, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0369, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.6210904121398926, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0387, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.4606397747993469, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0325, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.597671627998352, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0264, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.39612457156181335, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0291, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.514916718006134, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0327, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.3551333248615265, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0306, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.3721555173397064, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0343, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3669307231903076, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0339, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.5142899751663208, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0388, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.7722563147544861, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0319, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.5405625104904175, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.025, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.6617732048034668, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0361, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.8938334584236145, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0326, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.7913880944252014, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0325, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.6919751763343811, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0353, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.6518043279647827, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0292, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.8302627801895142, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0292, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.6278629302978516, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0314, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.42736759781837463, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0313, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 1.0469647645950317, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.038, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.4306422173976898, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.692587673664093, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.034, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.8272542953491211, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0332, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.700703501701355, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0435, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22474133968353271, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0348, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.47771376371383667, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0365, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.5043072700500488, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0336, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.4886966347694397, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0291, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.3845444321632385, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0418, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.6324570775032043, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0357, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.5614244937896729, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0351, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.4815816879272461, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0401, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.7729785442352295, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0357, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.589121401309967, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0319, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.5420895218849182, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0346, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.4504237771034241, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0279, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.26984909176826477, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.6075000762939453, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0319, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.6065084338188171, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0383, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.573225736618042, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0424, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.8821173906326294, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0409, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.4947790205478668, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0472, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.748337984085083, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0384, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.6375566124916077, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0373, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.6218035221099854, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0343, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.4296681880950928, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0317, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3609360158443451, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0348, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.49597665667533875, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.034, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.4339931309223175, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0351, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.44051092863082886, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0391, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.41610655188560486, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0345, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.6215106844902039, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0439, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.6418285965919495, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0289, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.6148926019668579, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0396, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.8690620064735413, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0371, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.4794996678829193, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.7622746229171753, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0396, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 1.0384955406188965, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0352, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.33424243330955505, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0272, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.5626234412193298, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0267, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.31714314222335815, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0297, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.8281066417694092, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0337, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.6054716110229492, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0336, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.5764144659042358, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0296, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.4696876108646393, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0318, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.5324695110321045, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0294, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.2989593744277954, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0275, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.6373855471611023, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0334, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.5332064032554626, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0333, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.4900652766227722, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.6812027096748352, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0321, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.6765509843826294, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0329, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.5016193389892578, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.034, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.5259473919868469, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0341, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.4551076292991638, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0289, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.5946309566497803, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0367, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.8045580387115479, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0292, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 1.089473843574524, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0433, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.7314861416816711, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0344, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.3244793713092804, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0329, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.9454575181007385, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.041, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.4321480393409729, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0338, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.7338399887084961, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0317, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.5811594724655151, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0299, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 1.1259782314300537, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0402, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.4460951089859009, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0279, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.4996945858001709, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0331, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.6428117156028748, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0339, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.7815113663673401, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0333, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.46364331245422363, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0321, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.6084109544754028, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0347, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.5775942206382751, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.4764224886894226, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0326, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.49608105421066284, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.033, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.40599140524864197, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0323, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.44920462369918823, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0348, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.393081396818161, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0329, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.5393109917640686, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0332, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.49641427397727966, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0341, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.4762181341648102, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0293, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.7498350143432617, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0338, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.5212231874465942, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0336, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3803718388080597, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0336, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.3723069429397583, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0313, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.6411343216896057, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0298, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.7487270832061768, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0334, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.4146348237991333, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0362, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.6354920864105225, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0345, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.8422425985336304, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.6452838182449341, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0317, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.6057304739952087, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0349, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.4880058467388153, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0283, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.6094764471054077, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0424, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.552979588508606, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0318, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.5134180188179016, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0267, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.3264164626598358, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0347, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.6406404972076416, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0326, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.4818336069583893, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0357, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.4660695791244507, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0348, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.527518093585968, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0293, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.5105645656585693, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0299, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.5807327628135681, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0348, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.34552720189094543, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0281, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.6902264952659607, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0345, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.7842390537261963, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0392, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.37371599674224854, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0307, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.4447094798088074, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0343, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.5179654359817505, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0328, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.34313148260116577, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0327, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.5038807988166809, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0398, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.5751231908798218, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0365, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.23205915093421936, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0338, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.3348182141780853, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0264, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.432725727558136, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0377, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.5504162907600403, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0334, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.7994229793548584, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0369, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.7374292016029358, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0305, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.786674976348877, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0283, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.39285191893577576, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.028, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.49710261821746826, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0285, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.2925172448158264, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0353, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.5930903553962708, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0265, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.5205737352371216, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0349, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.5042659044265747, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0376, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.6537132263183594, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0402, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.5453435182571411, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0344, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.7153663635253906, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0365, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.4821360409259796, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0359, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.389950156211853, + "learning_rate": 1.403120543105273e-05, + "loss": 0.031, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.6750137805938721, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0353, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.5380377173423767, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0329, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.45814576745033264, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0312, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.6910536289215088, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0349, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.49182868003845215, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0377, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.41329771280288696, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0383, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.47242429852485657, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0313, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.45115360617637634, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0294, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.44364428520202637, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0328, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.4205247461795807, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0282, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 1.0961225032806396, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0274, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.6065059304237366, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0327, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.3095875084400177, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0348, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.8527400493621826, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0285, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.4449825882911682, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0435, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 1.1708461046218872, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0312, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.6145966053009033, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0283, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.5100684762001038, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0331, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.37704023718833923, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0327, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.6774486899375916, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0347, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.4984931945800781, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0303, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.6189061403274536, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0316, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4665672183036804, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.038, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.898800790309906, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0292, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.5205129384994507, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0322, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.588542640209198, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0307, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.620620846748352, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.035, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.639234185218811, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0296, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.38672956824302673, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0355, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.5244165062904358, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0305, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.8960945010185242, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0323, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3789278566837311, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.031, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.5104514956474304, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0405, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.5860878825187683, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0376, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.9913963079452515, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0386, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.4112319350242615, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0276, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.703815221786499, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0303, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.7342479825019836, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0303, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.46025165915489197, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0324, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.3976695239543915, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0255, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.4137699604034424, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0298, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.6333696842193604, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0438, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.5179958343505859, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0268, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.5947912335395813, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0266, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.7916423678398132, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0363, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.7686305046081543, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0338, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.5727254152297974, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0275, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.8913756012916565, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0365, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.45855259895324707, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0401, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.8214496374130249, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0371, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.5001949667930603, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.033, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.6546716094017029, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0422, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.35789239406585693, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0323, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.7539666891098022, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0316, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.422543466091156, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0388, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.5595449805259705, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0351, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.3847978115081787, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0285, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.4276559352874756, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0292, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.5125867128372192, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0351, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.7208243012428284, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0293, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.5181360244750977, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0316, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.3499206304550171, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0281, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.26258599758148193, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.027, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.7002774477005005, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.031, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.5419202446937561, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0384, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.3112017512321472, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0234, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.6459445357322693, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0302, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.5128807425498962, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0385, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.41403454542160034, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0321, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.4647153615951538, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0358, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.29951611161231995, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0288, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.3440749943256378, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0274, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.413753867149353, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0276, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.29087361693382263, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.03, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.7001593708992004, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0277, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.47245970368385315, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0426, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.5747501850128174, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0337, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.42420580983161926, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0407, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.2931080162525177, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0344, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.8410253524780273, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0385, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.27601751685142517, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0304, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5673372745513916, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0261, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.5385505557060242, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0296, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.4159039556980133, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0343, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 1.0409079790115356, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0325, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.5017931461334229, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0311, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.45170727372169495, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0302, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.7260886430740356, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0353, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.7251535058021545, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0329, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.21863135695457458, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0354, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.5168152451515198, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0268, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.509765088558197, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0321, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.4227997958660126, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.031, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.5740527510643005, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0351, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.5497387647628784, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0277, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.3965212106704712, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.028, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.43198928236961365, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0421, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.42254316806793213, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0335, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.3395012617111206, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0309, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.6258816719055176, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0287, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.7914189100265503, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0263, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.4104739725589752, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0282, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.47704172134399414, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0358, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.7908433675765991, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0341, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.7039026021957397, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0369, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.4095489978790283, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.047, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.6500707864761353, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0285, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3794250190258026, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0293, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.3065261244773865, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.031, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.3773103654384613, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0303, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.602186918258667, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0398, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.5309048891067505, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0251, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.9474682211875916, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0345, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.7786683440208435, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0289, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.6320096850395203, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0326, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.7034086585044861, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0332, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.5060988664627075, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0337, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.7484520673751831, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0317, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.6556681394577026, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0349, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.41952699422836304, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0318, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.4678110182285309, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0328, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.35579657554626465, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0346, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.5984554290771484, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0277, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.41169118881225586, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0288, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.5163332223892212, + "learning_rate": 1.285944160290905e-05, + "loss": 0.027, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.780305802822113, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0249, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.4293205142021179, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0302, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.650065004825592, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0349, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.3155161142349243, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0333, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.5841111540794373, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0371, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.3873291015625, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0304, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.39657002687454224, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0279, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.6305680871009827, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0293, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.5810249447822571, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0317, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.6288999319076538, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0283, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.5402754545211792, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0258, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 1.3184820413589478, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0398, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.9564218521118164, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0301, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.8810652494430542, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0376, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.4254887104034424, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0336, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.45076319575309753, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0266, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.6057546138763428, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0292, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.4007343649864197, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0352, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.4183088541030884, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0265, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.368300199508667, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0326, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.4838104844093323, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0262, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.5136057138442993, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0299, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.5161435604095459, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0339, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.6350359320640564, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0361, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.5247905254364014, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0259, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.5668240785598755, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0324, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.48688119649887085, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0395, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.8496071100234985, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0326, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.7072296142578125, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0307, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.7262448072433472, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0376, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.5265096426010132, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0331, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.7246168851852417, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0286, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.4539868235588074, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.036, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.36881664395332336, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0302, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.37113773822784424, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0278, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.537762463092804, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0325, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.6519997715950012, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0309, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.31448549032211304, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0245, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.43815988302230835, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0398, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.525791585445404, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0261, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.4887944757938385, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.025, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.5287007689476013, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0278, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.7277513146400452, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0304, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.6415050029754639, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0292, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.48691895604133606, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0337, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.53068608045578, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0338, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.5464624762535095, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0303, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.3911614418029785, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0345, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.6894099116325378, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0365, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.5268317461013794, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0405, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.8635499477386475, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0321, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21542859077453613, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0264, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.6257337331771851, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0355, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.6525475978851318, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0304, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.4599299430847168, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0314, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.7497361898422241, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.031, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.3124896287918091, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0257, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.6170748472213745, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0323, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.4619428515434265, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0315, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.5088011026382446, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0255, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.5397948622703552, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0265, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.457082062959671, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0279, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.4131294786930084, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0269, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 1.1949660778045654, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.6057063341140747, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0306, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.26918280124664307, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0283, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.48841091990470886, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0323, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.6195886135101318, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0295, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.5798623561859131, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.031, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.4877539277076721, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0267, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.33261221647262573, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0261, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.8361077904701233, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0311, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.305922269821167, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0302, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.22662357985973358, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.028, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.4273515045642853, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0307, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.521216869354248, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0277, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.7090896368026733, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0346, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.3693661391735077, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0305, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.3651321530342102, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0263, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.5577923655509949, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0357, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.6504148840904236, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0404, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.49205282330513, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.035, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.6053458452224731, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0328, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.5949649214744568, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0302, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.5310356020927429, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0264, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4087911546230316, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0273, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.35929426550865173, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0274, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.5112904906272888, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0253, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.39148232340812683, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0305, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.47718697786331177, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0304, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.620936393737793, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0289, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.8953443169593811, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0328, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.4663226902484894, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0302, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.707167387008667, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0319, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.5325813889503479, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0318, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.6239158511161804, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0289, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.38823947310447693, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0266, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.48849165439605713, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0234, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.23214028775691986, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0276, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3467197120189667, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0282, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.2009357064962387, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0298, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.8589951395988464, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0264, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.43969056010246277, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0292, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.5750611424446106, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0289, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.5399556756019592, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0307, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.20517395436763763, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0249, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.7490189671516418, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0246, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.6661257743835449, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0325, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.571394681930542, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0342, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.8792482018470764, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0332, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.5770248770713806, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0286, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.62962406873703, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0246, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.4651380479335785, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.037, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.5087499022483826, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0265, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.44421979784965515, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0306, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.6521517038345337, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0334, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.5384942889213562, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0296, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.41909387707710266, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0297, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.6697047352790833, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0331, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.4015032947063446, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0326, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.48070228099823, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0278, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.8651071786880493, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0242, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 1.17703378200531, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0288, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.45865103602409363, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0322, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.41243845224380493, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0297, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.482997864484787, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0305, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.5319142937660217, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0284, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.6116752028465271, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0311, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.4214901328086853, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0269, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.6246733069419861, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.026, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.4263368248939514, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0305, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.4059041738510132, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.022, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.6362516283988953, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0265, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.2905973494052887, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0297, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.42270833253860474, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0255, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.26410749554634094, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0252, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.7570974230766296, + "learning_rate": 1.153689339251154e-05, + "loss": 0.027, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.5941224098205566, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0295, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.3985750079154968, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0337, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.3877560496330261, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.024, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.44742006063461304, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0284, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.3280893564224243, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0318, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.5289477109909058, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0341, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.4976208806037903, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0239, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.6153465509414673, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0252, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.6112402677536011, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0292, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.4973732531070709, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0307, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.5871816277503967, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0254, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 1.2150986194610596, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.033, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.6406526565551758, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0265, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.4251798093318939, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0269, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.4702431857585907, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0311, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.3235304355621338, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0236, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.4913889467716217, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0231, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.4980977177619934, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0289, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.740922212600708, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0334, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.3305300772190094, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0301, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.7037357091903687, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0311, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.44783756136894226, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0339, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.7776843309402466, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0349, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.49181437492370605, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0285, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.333814799785614, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0284, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 1.203652262687683, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0365, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.521643877029419, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0313, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.33309581875801086, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0265, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.48567256331443787, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0357, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8473871946334839, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0355, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.43827518820762634, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0266, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.5849157571792603, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0317, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.5690399408340454, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0266, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.6484784483909607, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0294, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.8894811272621155, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0239, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4575272798538208, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0323, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.4288756847381592, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.032, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.8871303200721741, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0243, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.5861580967903137, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0335, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.4159319996833801, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0247, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.6948496699333191, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0299, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.5089551210403442, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0333, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.6912631392478943, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0303, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.6213784217834473, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0295, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.4634060561656952, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0261, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.5664045214653015, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0262, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.7963227033615112, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0278, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.45378491282463074, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0268, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.8970746994018555, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0271, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.5109472274780273, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0307, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.5023297667503357, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0263, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.6055631041526794, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0285, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.38602766394615173, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0282, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.5447302460670471, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0319, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.6613780856132507, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0271, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 1.0358555316925049, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.026, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.4463629722595215, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0271, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.5373798608779907, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.025, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.7735916972160339, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0325, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.5017692446708679, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0262, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.3406142592430115, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0271, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.28971537947654724, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0238, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.45441415905952454, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0261, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.4653581976890564, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.026, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.5449947714805603, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0314, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.41015395522117615, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0272, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.5936392545700073, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0269, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.5043690800666809, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0256, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.6176534295082092, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0285, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.6774734258651733, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0268, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.7045454978942871, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0305, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.5905448794364929, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0284, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.7881343364715576, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0321, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.6635507941246033, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0284, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.46298888325691223, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0394, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.5187172889709473, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0257, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.5974661707878113, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0305, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.5171123743057251, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0275, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.35988888144493103, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0295, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.30543047189712524, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0334, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.6582810878753662, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0309, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.4986134171485901, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0294, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.5560855269432068, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0224, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.28974607586860657, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0313, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.24015791714191437, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.026, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.2704199552536011, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0244, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.6661707162857056, + "learning_rate": 1.068904422762975e-05, + "loss": 0.027, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.5058556795120239, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0254, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.7086800336837769, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0242, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.6752822399139404, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0262, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.8279762268066406, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0312, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.5070614814758301, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0308, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.3933897614479065, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0287, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.37238794565200806, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0325, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.7591347098350525, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0265, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.4841652810573578, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0331, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.45236295461654663, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0412, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.4774094820022583, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0289, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.47564345598220825, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0294, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.341337651014328, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0281, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.341701865196228, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0224, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.6621959209442139, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0283, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.348466694355011, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0234, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.35208311676979065, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0248, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.4973156154155731, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0246, + "step": 16000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.99422246780928e+16, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5fecc60b61aa66699566b01045633ce2fd4a6a74 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-16000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad96fcc5212b0fb64af2ed9b5a1ad33dee0cea6a86c08271b39c38f4388a38a +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ff404721db24fa58b0de22d88e246e89d8573c18 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec67a7cdc93d102f1d5b779d57dcbc9e24ae332d87ee1cdf1826a4db28f4bf9 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..352d6aa390ef8eb51029a8ebf8d1aad7c82cd935 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14809826df3d34d78663fd751401bdb5451477d62b3ffd78c52d6c70ede64fa4 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ca55befb81980342aff7bf56e7293b042083065a --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3092ba4b1956497ccb6966839fa956e1598ed94e96d54487388f3222935ab75 +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9801b876d4902a6f04c8f4fc65c072e6082867 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -4.131592681121827, + -18.96289906921387, + -16.909606227111816, + -1.205507601451874, + -2.2364452423095704, + -1.8819086204528812, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 16.65274486618042, + 37.19429024200439, + 23.655689654541014, + 1.3209557065963748, + 2.6528479496955875, + 1.1486967510223387, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 2.868856906890869, + 6.296340465545654, + 1.3196077346801758, + 0.007151931058615446, + -0.012491658329963684, + -0.12626242637634277, + 0.12140887975692749, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 4.3321146965026855, + 12.4215087890625, + 7.703039169311523, + 0.391439288854599, + 0.8076039552688599, + 0.505150318145752, + 0.9926025867462158, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.763728466033935, + -21.229162658691408, + -2.350775989151001, + -4.0587354017257695, + -3.285622364997864, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.5495108631134, + 30.41332916412354, + 14.36571702880859, + 1.8286980584144592, + 2.2455153399467473, + 1.9114159921646117, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.469674587249756, + 1.137302041053772, + -3.50521183013916, + -0.009232619777321815, + -0.7088616490364075, + -0.43785586953163147, + 0.14176446199417114, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.948984146118164, + 16.641460418701172, + 8.162801742553711, + 0.6890953779220581, + 1.1180040836334229, + 0.9564125537872314, + 0.9899004101753235, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6f3f171191036b66d1c2163ffa620ce1e0b17afe --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/trainer_state.json @@ -0,0 +1,12634 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0785547366528851, + "eval_steps": 500, + "global_step": 18000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 5.55898904800415, + "learning_rate": 1.8e-07, + "loss": 0.7669, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.9875104427337646, + "learning_rate": 3.8e-07, + "loss": 0.7281, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 6.316451072692871, + "learning_rate": 5.800000000000001e-07, + "loss": 0.7134, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 4.037688255310059, + "learning_rate": 7.8e-07, + "loss": 0.6077, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 5.4920220375061035, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6779, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 3.809985876083374, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5578, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 5.501481533050537, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5453, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 2.584683418273926, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4145, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 2.854585886001587, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3617, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 3.2181553840637207, + "learning_rate": 1.98e-06, + "loss": 0.3402, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 1.6713179349899292, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2286, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 2.60302996635437, + "learning_rate": 2.38e-06, + "loss": 0.2477, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 1.7488818168640137, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1342, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 1.826812982559204, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.1243, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 1.1744091510772705, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1012, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 2.3573529720306396, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1108, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 2.1422371864318848, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1081, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.6756604313850403, + "learning_rate": 3.58e-06, + "loss": 0.0947, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 1.8197052478790283, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.103, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 2.135390281677246, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0791, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 1.185013771057129, + "learning_rate": 4.18e-06, + "loss": 0.0751, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 1.478454828262329, + "learning_rate": 4.38e-06, + "loss": 0.0685, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 1.1979939937591553, + "learning_rate": 4.58e-06, + "loss": 0.0642, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 1.3315266370773315, + "learning_rate": 4.78e-06, + "loss": 0.0706, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 1.219875454902649, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 1.9281997680664062, + "learning_rate": 5.18e-06, + "loss": 0.0781, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.5599610209465027, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0742, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.9128719568252563, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0638, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.5633432269096375, + "learning_rate": 5.78e-06, + "loss": 0.0633, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.7961149215698242, + "learning_rate": 5.98e-06, + "loss": 0.062, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 1.9408375024795532, + "learning_rate": 6.18e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 1.1925369501113892, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0654, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 1.0636825561523438, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0513, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.5671424865722656, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0561, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.8431388139724731, + "learning_rate": 6.98e-06, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 1.3813819885253906, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0619, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.7528055906295776, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0502, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 1.38446044921875, + "learning_rate": 7.58e-06, + "loss": 0.0623, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.9472984671592712, + "learning_rate": 7.78e-06, + "loss": 0.0471, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.640555739402771, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0539, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 1.4841065406799316, + "learning_rate": 8.18e-06, + "loss": 0.0684, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 1.0691452026367188, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.8026740550994873, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0579, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 1.3472259044647217, + "learning_rate": 8.78e-06, + "loss": 0.0725, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.8364902138710022, + "learning_rate": 8.98e-06, + "loss": 0.0613, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 1.0086181163787842, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0558, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 1.0559569597244263, + "learning_rate": 9.38e-06, + "loss": 0.0561, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.9138600826263428, + "learning_rate": 9.58e-06, + "loss": 0.0507, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.6099390387535095, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0543, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.890690803527832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.071, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.8349231481552124, + "learning_rate": 1.018e-05, + "loss": 0.0515, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 1.5466762781143188, + "learning_rate": 1.038e-05, + "loss": 0.0865, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 1.0859519243240356, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0511, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.7235454320907593, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.6314525008201599, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0494, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 1.5067164897918701, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0453, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.9329689145088196, + "learning_rate": 1.138e-05, + "loss": 0.0565, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 1.3631505966186523, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0513, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 1.2341063022613525, + "learning_rate": 1.178e-05, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.7126315832138062, + "learning_rate": 1.198e-05, + "loss": 0.0465, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.9995419383049011, + "learning_rate": 1.218e-05, + "loss": 0.0423, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.7614652514457703, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0466, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.7718682289123535, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0508, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.7280911803245544, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0481, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.6350377798080444, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0493, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.6868598461151123, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.057, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 1.132020354270935, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0464, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 1.097875952720642, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0465, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.8246905207633972, + "learning_rate": 1.378e-05, + "loss": 0.0488, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.5858931541442871, + "learning_rate": 1.398e-05, + "loss": 0.0533, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.418e-05, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.87618488073349, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0417, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.8312808871269226, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0627, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.5213949680328369, + "learning_rate": 1.478e-05, + "loss": 0.0526, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.7599508762359619, + "learning_rate": 1.498e-05, + "loss": 0.0487, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.9282987713813782, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0544, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 1.5959566831588745, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0594, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.6384497284889221, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.049, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.5377854108810425, + "learning_rate": 1.578e-05, + "loss": 0.0529, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.6186609864234924, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0485, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.9750168323516846, + "learning_rate": 1.618e-05, + "loss": 0.0458, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.6810588836669922, + "learning_rate": 1.638e-05, + "loss": 0.0521, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.8613447546958923, + "learning_rate": 1.658e-05, + "loss": 0.0464, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.8379164338111877, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0589, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.9312345385551453, + "learning_rate": 1.698e-05, + "loss": 0.0534, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.6983106732368469, + "learning_rate": 1.718e-05, + "loss": 0.0591, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.6549938321113586, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0571, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3887499272823334, + "learning_rate": 1.758e-05, + "loss": 0.0362, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 1.1392686367034912, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0602, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.834979772567749, + "learning_rate": 1.798e-05, + "loss": 0.0483, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.9094700813293457, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0536, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.9519254565238953, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0514, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.6514044404029846, + "learning_rate": 1.858e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.6005147099494934, + "learning_rate": 1.878e-05, + "loss": 0.0527, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 1.0990339517593384, + "learning_rate": 1.898e-05, + "loss": 0.0453, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.7029110193252563, + "learning_rate": 1.918e-05, + "loss": 0.0527, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.6106461882591248, + "learning_rate": 1.938e-05, + "loss": 0.043, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.48976996541023254, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0482, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 1.045139193534851, + "learning_rate": 1.978e-05, + "loss": 0.0449, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.7444337010383606, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0499, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.8378720879554749, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0606, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.5345956683158875, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.041, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.6428268551826477, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0648, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.9010246992111206, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0441, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.6655222177505493, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0532, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.5328973531723022, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0488, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 1.2394806146621704, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0525, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.9671902656555176, + "learning_rate": 1.999967041472886e-05, + "loss": 0.051, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.8754792213439941, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.054, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.524354875087738, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0682, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 1.0633796453475952, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0435, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.7348024249076843, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.923546552658081, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0501, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 1.0579051971435547, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.8214036822319031, + "learning_rate": 1.999882759038658e-05, + "loss": 0.057, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.7640904188156128, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0468, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5744732022285461, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0416, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.40397152304649353, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0389, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.6207796931266785, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0484, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 1.5230320692062378, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0586, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.8499330282211304, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0671, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.7697583436965942, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.061, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.6107252836227417, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.40468829870224, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0558, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.7711566686630249, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0487, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 1.0216137170791626, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0411, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 1.1135109663009644, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0428, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.545289158821106, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0426, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.9514102339744568, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0529, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.9448748826980591, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0468, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 1.1176340579986572, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.06, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.6428054571151733, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0398, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.8000763058662415, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0688, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.7624617218971252, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0524, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.7986068725585938, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0511, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 1.179044246673584, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0518, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.7511209845542908, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.041, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.8336644768714905, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.7198546528816223, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0472, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 1.404756784439087, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0479, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.861412525177002, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0448, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 1.2575286626815796, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0504, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.7020149230957031, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0416, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.9072129726409912, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0483, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.5503928661346436, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0498, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.5776561498641968, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.7854406237602234, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0431, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.7011817097663879, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0615, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.7760916352272034, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0525, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.9866206049919128, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0492, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.7466640472412109, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0564, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.8808642029762268, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0461, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.8980852365493774, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0613, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.6824257969856262, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0763, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.681532084941864, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0492, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.5667393207550049, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0471, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.5026432275772095, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0424, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.37448638677597046, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.037, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.6236661076545715, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.9748323559761047, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0326, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.7733910083770752, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0527, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.6466084718704224, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.6644402146339417, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0434, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 1.5936143398284912, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0495, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.5655786991119385, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0475, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.9557194709777832, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0518, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.8929481506347656, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0435, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.7515624761581421, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0404, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.7718303203582764, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0476, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.5583183765411377, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0495, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.7166038155555725, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0601, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.9311782717704773, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0507, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6159361600875854, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0319, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.816769003868103, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0505, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.9040331244468689, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.0498, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 1.696012020111084, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0689, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.5169436931610107, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0414, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 1.9156256914138794, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0558, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.6522107720375061, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0427, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.8480607867240906, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0425, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.6939795017242432, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0521, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.5763843059539795, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0486, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.6420201063156128, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0428, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.5305889248847961, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0371, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 1.3216971158981323, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0441, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.6441370844841003, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0444, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 1.4227683544158936, + "learning_rate": 1.996014938229576e-05, + "loss": 0.053, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.667000412940979, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0405, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.6865925192832947, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0532, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.8819414377212524, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0402, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.8738685250282288, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0494, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.8790421485900879, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0753, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.5451251268386841, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0385, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.46721863746643066, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0395, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.41896265745162964, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0461, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.7582527995109558, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0461, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.7154091596603394, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0464, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.788686215877533, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.46885132789611816, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0472, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.5174703598022461, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0501, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.8058022260665894, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.044, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.49327152967453003, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0404, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 1.532515048980713, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0548, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 1.1101130247116089, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0542, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.7396823763847351, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.042, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5801792740821838, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0589, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 1.4451886415481567, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0402, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.61793053150177, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0583, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.8073042631149292, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0492, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.9468027949333191, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0466, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.7384629249572754, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0589, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.4612124562263489, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.043, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.6821345090866089, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0373, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.6727206110954285, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0706, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.6935863494873047, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0376, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.9824007153511047, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0418, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.9782054424285889, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0453, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.7749345898628235, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0449, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 1.1558616161346436, + "learning_rate": 1.992544454099507e-05, + "loss": 0.051, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.33876606822013855, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0463, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.5539175271987915, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0389, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.554639995098114, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0375, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.46284249424934387, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0365, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.7209586501121521, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0465, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 1.0352572202682495, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0609, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.3893347680568695, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0449, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.3959295451641083, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.042, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.47758615016937256, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0608, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.7173318266868591, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0511, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.5889247059822083, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.5986958146095276, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.9506963491439819, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0513, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.8730902671813965, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0429, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.5152983069419861, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0347, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.786233127117157, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0464, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.7376151084899902, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0479, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.595055878162384, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0392, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.8207923769950867, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0441, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.7003177404403687, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.036, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.6637803316116333, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.5207458138465881, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0476, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 1.241939663887024, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0466, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.7212964296340942, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0459, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.6244897246360779, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.571205198764801, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0611, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.8839776515960693, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0464, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.580142080783844, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0434, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.6745111346244812, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0443, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.9726730585098267, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.48007458448410034, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0442, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.7205815315246582, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0461, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.5800597667694092, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0553, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.6497617959976196, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0398, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.7487000226974487, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.053, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.6686383485794067, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0494, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.6101617217063904, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0397, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.49039891362190247, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 1.076252818107605, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0472, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.7085466980934143, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0481, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.6343501210212708, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.7452435493469238, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0485, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.6645557880401611, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0455, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.5987662076950073, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0384, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 1.078682541847229, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0416, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.8880276083946228, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0427, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.8119439482688904, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0516, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.5018808245658875, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.035, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.623843252658844, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0468, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.48201584815979004, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0387, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.5672967433929443, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0374, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.7304291129112244, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0458, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 1.1493513584136963, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0495, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.8220258951187134, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0565, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 1.0740118026733398, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0484, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.6214267015457153, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0346, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6255515813827515, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 1.0625102519989014, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0511, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.8623147010803223, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.043, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.92961186170578, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0428, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.6050530076026917, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0405, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.944632351398468, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0434, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.4904105067253113, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0423, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.7352654337882996, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0425, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 1.0492011308670044, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0616, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.7823440432548523, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0447, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.8018720149993896, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0371, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.49853745102882385, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.036, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.8805229663848877, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0524, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.5573164820671082, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0387, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.7481330633163452, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0466, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.40816730260849, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0651, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.6791403889656067, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0393, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.7291558384895325, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0521, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.6312416791915894, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0489, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.7327824831008911, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0343, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 1.3112396001815796, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 1.2425460815429688, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0419, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.6839079856872559, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0491, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.7781338691711426, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0434, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.5329035520553589, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0468, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.7196246981620789, + "learning_rate": 1.978769450291435e-05, + "loss": 0.044, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.7625473737716675, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0441, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.5458085536956787, + "learning_rate": 1.978346349055984e-05, + "loss": 0.039, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.7765107154846191, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0467, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.7010345458984375, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.04, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.626748263835907, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0373, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.5149411559104919, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0461, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.9740221500396729, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.037, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.504397988319397, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.054, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.5483772158622742, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0365, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.29313552379608154, + "learning_rate": 1.976612732743278e-05, + "loss": 0.038, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.8453809022903442, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0413, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.5152369141578674, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0383, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.9969985485076904, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0465, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.9506912231445312, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0377, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.9154256582260132, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0428, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 1.2283018827438354, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0403, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.6880149841308594, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0395, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.4900283217430115, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0368, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.7604786157608032, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0447, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.559420108795166, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0456, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5867525339126587, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.4810929596424103, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0406, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.8294567465782166, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0405, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.8964418172836304, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0551, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5311513543128967, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.048, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.806564450263977, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0422, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.6752825975418091, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0436, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.5873673558235168, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.046, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.44951826333999634, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.6930672526359558, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0482, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5176821351051331, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0469, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.49050986766815186, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0505, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.7312544584274292, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0397, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.7582018375396729, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0472, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.5867499113082886, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0402, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.5435264706611633, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0357, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.7370457053184509, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.774713933467865, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0419, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 1.3614526987075806, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0443, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.6087996959686279, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0362, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.6685174703598022, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0437, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.9508783221244812, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0403, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.5553990006446838, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0454, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.5054144263267517, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0651, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.42293739318847656, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0431, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.7212286591529846, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0415, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.473127543926239, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.6872493028640747, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.031, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.5251455903053284, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0391, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.5380337834358215, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0409, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.7052116394042969, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0416, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.8229309916496277, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0372, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.9506240487098694, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0419, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.6417449116706848, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.6112877130508423, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0498, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 1.0621747970581055, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0478, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.7538444995880127, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0402, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.5625021457672119, + "learning_rate": 1.964833301001045e-05, + "loss": 0.048, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.47914358973503113, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0371, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.6854084134101868, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0478, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.9252145886421204, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0368, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.8439743518829346, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0417, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 1.0050065517425537, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0444, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.7451267242431641, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0444, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.8371824622154236, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0413, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 1.0461528301239014, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0343, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.39973369240760803, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0411, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.4291725754737854, + "learning_rate": 1.962083815106258e-05, + "loss": 0.035, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.7072318196296692, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.5897591710090637, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0422, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.724743664264679, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0412, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.6499989628791809, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0456, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.7375554442405701, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0481, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.5231707096099854, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0444, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.6235650777816772, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0352, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.43499720096588135, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0389, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.797736406326294, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0444, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 1.0550916194915771, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0504, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.6214169263839722, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0406, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.698083221912384, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0593, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.6379665732383728, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0493, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.5507146120071411, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0433, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.5956857204437256, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.44772031903266907, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0479, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.9360495209693909, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0434, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.5642439126968384, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0396, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.4046037495136261, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0408, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.5948778986930847, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0349, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.8199960589408875, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.035, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.4827987253665924, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0422, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.8324541449546814, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0396, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.4008340537548065, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0399, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.6216022372245789, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0456, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.37505266070365906, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0385, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.49176743626594543, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0394, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.5399725437164307, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0438, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.8310949802398682, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 1.1955338716506958, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0459, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 1.0068060159683228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0491, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.5460902452468872, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.7850955128669739, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.038, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.36727651953697205, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.042, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.5334084630012512, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0472, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.7271261215209961, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0382, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.5323888063430786, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0436, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.45585381984710693, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0374, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.7871994376182556, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0523, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.5605924129486084, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0394, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.6938880085945129, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0394, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.5804795026779175, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0437, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 1.0168874263763428, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0419, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.6860261559486389, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0381, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.7029629349708557, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.5081820487976074, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0359, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4721413254737854, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0445, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.36132606863975525, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0443, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.6331628561019897, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.5754039287567139, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0364, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 1.5680726766586304, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0568, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.49352893233299255, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0352, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.6292720437049866, + "learning_rate": 1.945830755977688e-05, + "loss": 0.056, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.7185224294662476, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.5580431222915649, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0395, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.7590157985687256, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0367, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.6500505208969116, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0373, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.408975213766098, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0458, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.5616204142570496, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.6361889243125916, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0371, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.8486977219581604, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0428, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.7492835521697998, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0444, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.7901867032051086, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0413, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.6845218539237976, + "learning_rate": 1.942106227801521e-05, + "loss": 0.041, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.9644033908843994, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0482, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.45466694235801697, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.37155815958976746, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0563, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.4936427175998688, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0466, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.6540364027023315, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0426, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.38369905948638916, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.5450782179832458, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0499, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.24151510000228882, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0431, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.8069043159484863, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0447, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.5423257946968079, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0496, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.4058588445186615, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0402, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.6126188635826111, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0458, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.7490487694740295, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0493, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.7295238971710205, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0462, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.7178632616996765, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0443, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.7040836215019226, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0414, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.6338651776313782, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0354, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 1.3360642194747925, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0503, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.46927154064178467, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.7340303659439087, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0381, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5492366552352905, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0328, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.7509336471557617, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0368, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.4471103847026825, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0405, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.6582043170928955, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0422, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.6933317184448242, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0347, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.450021892786026, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.5376274585723877, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0619, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.722744882106781, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0446, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.6075776219367981, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.047, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.6950559020042419, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0366, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.5763269066810608, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0416, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.5462995767593384, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.042, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.6304270029067993, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0388, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.6828057765960693, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0324, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.37152284383773804, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0451, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.4172256588935852, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0357, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.5640333294868469, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0522, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.8016167879104614, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0381, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.591262698173523, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0382, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.5212893486022949, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0478, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.7837402820587158, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0443, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.9257993698120117, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0468, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.5952717065811157, + "learning_rate": 1.926404507646751e-05, + "loss": 0.033, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.9675727486610413, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0451, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5518060326576233, + "learning_rate": 1.925630281527157e-05, + "loss": 0.039, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.9742224216461182, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0398, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.6197847723960876, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0466, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.47963154315948486, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0449, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.41337478160858154, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0441, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.7238340973854065, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0438, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.9248948097229004, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.059, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.6670559048652649, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0388, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.956350564956665, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0402, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.6378766894340515, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0377, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.9037134647369385, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.046, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.7720431685447693, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0519, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.7988153100013733, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0437, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.6672379970550537, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.8264118432998657, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0463, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.6753244400024414, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.048, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.5530163645744324, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0552, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 1.4215611219406128, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0537, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.8495141267776489, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0431, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.5609806180000305, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0355, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.30011680722236633, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0503, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.5155858993530273, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0402, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.48371294140815735, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0476, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.49065709114074707, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.4877799451351166, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0337, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.5917441248893738, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0379, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.42583322525024414, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.045, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.6343463659286499, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0449, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.8575727343559265, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0453, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.7644649147987366, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0396, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.6534778475761414, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0354, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.3632607161998749, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.035, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.9180629849433899, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.48914220929145813, + "learning_rate": 1.912298771234382e-05, + "loss": 0.043, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.8579902052879333, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0467, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 1.523177146911621, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 1.2650493383407593, + "learning_rate": 1.911035077753307e-05, + "loss": 0.046, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.8262631893157959, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0345, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.8710194826126099, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.8287770748138428, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.7243760824203491, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0445, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5953600406646729, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0409, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.5678296685218811, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0405, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.764759361743927, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0399, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.5969082713127136, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0345, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.5686851739883423, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0415, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.7042335867881775, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0343, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.46049684286117554, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0367, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.521037757396698, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0493, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.6116137504577637, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0341, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.6932541728019714, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.038, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.6795322299003601, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0555, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 1.5589205026626587, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0498, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.58689945936203, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0432, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.7746279239654541, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0455, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4707143008708954, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0365, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.6717873811721802, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0441, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 1.1001774072647095, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0387, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.6617273092269897, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.045, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 1.0732862949371338, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0461, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.43623387813568115, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0387, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.5842541456222534, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0401, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.8832051753997803, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0434, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.8454849123954773, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0364, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4587421119213104, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0342, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.5914700627326965, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0381, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.5075448751449585, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0614, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.6165316700935364, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0355, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.8761339783668518, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0382, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.8730667233467102, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0486, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.4631735384464264, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0479, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.7657212615013123, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0359, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.49685898423194885, + "learning_rate": 1.894749443411004e-05, + "loss": 0.037, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.8567603230476379, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0415, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.8778802156448364, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0427, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.7849876284599304, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.041, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.49304109811782837, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0406, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.6490961909294128, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0457, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 1.1704363822937012, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0489, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.6955687403678894, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0426, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.9385222792625427, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0397, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 1.0259507894515991, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0406, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 1.5581048727035522, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0377, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 1.1154224872589111, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0352, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.8913238048553467, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0372, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.32929253578186035, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0302, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.7686375379562378, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.7077587246894836, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0404, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.7370178699493408, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0379, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.8013477325439453, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0391, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.9743591547012329, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0466, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.6816489100456238, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0509, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.7814317345619202, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0449, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.6797910332679749, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.041, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.7159250378608704, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0408, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.7630175352096558, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0403, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.7929314374923706, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0468, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.5765302181243896, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0382, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.5043740272521973, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0447, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.7895818948745728, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0381, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.8037170767784119, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0434, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 1.0758732557296753, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0369, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.6673927307128906, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0475, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.6661775708198547, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0478, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.6422731280326843, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.6632615923881531, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0377, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.5715954899787903, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0306, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3375200629234314, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0385, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.42938506603240967, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0359, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.453436940908432, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0498, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.763883113861084, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.037, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.9350517392158508, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0524, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.6795313358306885, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0336, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4761887788772583, + "learning_rate": 1.875213208215953e-05, + "loss": 0.04, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.6547576189041138, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0359, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.7119831442832947, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0382, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.5195598602294922, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0577, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.44893282651901245, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.034, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.5159012079238892, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0374, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.6474353075027466, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0275, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.5070436000823975, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0382, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.28868627548217773, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0442, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.3915226459503174, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.6271824836730957, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0395, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 1.2117619514465332, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0409, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.4455721378326416, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0399, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0445, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.32646581530570984, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0435, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.4477322995662689, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0383, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.6562448740005493, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0317, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.25427868962287903, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0326, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.6234788298606873, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0328, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.4264411926269531, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0379, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.5537038445472717, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0383, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.5042442679405212, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0339, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.4152010679244995, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0324, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.6834092736244202, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0364, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.6276392340660095, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0336, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.687937319278717, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0415, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.48481765389442444, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0376, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 1.1335153579711914, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0421, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.6853719353675842, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.043, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.97500079870224, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0334, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.2953243553638458, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0334, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.6563237309455872, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0349, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.4983973205089569, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0441, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.42969775199890137, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0319, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.8316324353218079, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0359, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.4386466443538666, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0371, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.5664681792259216, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0359, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.5660601854324341, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.6432987451553345, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0447, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.6026568412780762, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0382, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.5358585119247437, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0366, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.3575671315193176, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0394, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.6645073890686035, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0391, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.6527594923973083, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0334, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.5664045810699463, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0426, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.3317505419254303, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0366, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.7218614220619202, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0399, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.6683867573738098, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0385, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.6589217185974121, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0445, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.39663317799568176, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0515, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.9468401074409485, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 1.0980640649795532, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0431, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 1.4567275047302246, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0467, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.3785778284072876, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0437, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.8112056255340576, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0406, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.8885411024093628, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0452, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.3356691002845764, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.033, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.7636258602142334, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.039, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.5050523281097412, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0331, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3761812150478363, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0346, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.560323178768158, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0417, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.5850566625595093, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0366, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.4377721846103668, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0315, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.5460193157196045, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0465, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.3818223476409912, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0313, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.566722571849823, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.037, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.970040500164032, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0354, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4968736171722412, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0376, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.5235893130302429, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0383, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.853208065032959, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0384, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.4627811312675476, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0615, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.4883791208267212, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0307, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.4702740013599396, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0539, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.5020611882209778, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0378, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.706611692905426, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0309, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.6137747764587402, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0364, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.45299193263053894, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0359, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.31410297751426697, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0425, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.48510870337486267, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.04, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.4697261154651642, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0401, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.8231471180915833, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0346, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.9511741995811462, + "learning_rate": 1.832162565208597e-05, + "loss": 0.038, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.4473752975463867, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0421, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.5309840440750122, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0375, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 1.1700010299682617, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0424, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.5007262229919434, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0389, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.8835527300834656, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.6059357523918152, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.37744027376174927, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0391, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.5641717910766602, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0383, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.4394749104976654, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0394, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.7094572186470032, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0384, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.6306723952293396, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0347, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.4480315148830414, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0415, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 1.014607310295105, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0426, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.7599517107009888, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0433, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 1.0942739248275757, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0378, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.47618037462234497, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0312, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6470023393630981, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0382, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.6031871438026428, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0336, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.7470970749855042, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0318, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.46166181564331055, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0361, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5585920214653015, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0443, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.5172198414802551, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0396, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.4908123314380646, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0294, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.5269665122032166, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0343, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.747257649898529, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0395, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.6794129610061646, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0471, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.4291394054889679, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0388, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.8051080107688904, + "learning_rate": 1.815952390818299e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.557299792766571, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0384, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.37832972407341003, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0333, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.30844688415527344, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.033, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.3014371395111084, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0344, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.778361439704895, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0351, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 1.14492666721344, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0462, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.35099321603775024, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0371, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.8470032215118408, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0339, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.641718327999115, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0363, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.6668172478675842, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0383, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.9396918416023254, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0401, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.5773718953132629, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0356, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.6474881172180176, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0487, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.5183063745498657, + "learning_rate": 1.807599344877606e-05, + "loss": 0.037, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.7699562311172485, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0487, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.6379490494728088, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0407, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4757876396179199, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0307, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.47382786870002747, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0367, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.6868136525154114, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0311, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.5475189089775085, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0293, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 1.013775110244751, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0383, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.46351560950279236, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0404, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.4883617162704468, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0408, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.6282979249954224, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0428, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 1.0833567380905151, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0394, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0405, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.7581565380096436, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0534, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.7900646328926086, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0432, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.6033529043197632, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0438, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.924926221370697, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0347, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.8485580682754517, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0523, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.3205278217792511, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0334, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.5392606854438782, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.03, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.6815987229347229, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0385, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.9605218768119812, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0359, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.5565723776817322, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0391, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.7528144717216492, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0431, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.5746167898178101, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0346, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.5058369636535645, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0346, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 1.1387027502059937, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0372, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.819324254989624, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0374, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.45600345730781555, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0344, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.7428935766220093, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0373, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.6960753202438354, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0387, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.6637990474700928, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0404, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.5612137317657471, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0375, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.6323001384735107, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0379, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.35169267654418945, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0371, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.38252803683280945, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0457, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.38694459199905396, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0345, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.37036198377609253, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0292, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.8060199618339539, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0398, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.44252580404281616, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0373, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.5565180778503418, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0345, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.4460795521736145, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0404, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.7309815883636475, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0364, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.6990997195243835, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0561, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.4198327660560608, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0401, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.04, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.48884230852127075, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0334, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.6440362930297852, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0451, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.9092825055122375, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0398, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.4839508533477783, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.8128801584243774, + "learning_rate": 1.776452218695584e-05, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.5291397571563721, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0394, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.6852243542671204, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0418, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.6294205188751221, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0374, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.5221384763717651, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0321, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.398296982049942, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0349, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.43008267879486084, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.6012991070747375, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0411, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.45076051354408264, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.037, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.6742259860038757, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0357, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.5989789962768555, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.037, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.4041040241718292, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0325, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.4937855899333954, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0354, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.5446217656135559, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0374, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.7479701638221741, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0415, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.7822495102882385, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0341, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.3672648072242737, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.035, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.5219965577125549, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0443, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.4092100262641907, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0331, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.5316944122314453, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0406, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 1.072263240814209, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0521, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.7448581457138062, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0362, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.44557711482048035, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0326, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.4298631250858307, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0365, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.45413365960121155, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0351, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.9562819004058838, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0394, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.9481335878372192, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0381, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.5020818114280701, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0402, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.6412234902381897, + "learning_rate": 1.758137056131309e-05, + "loss": 0.037, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.5511493682861328, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0535, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.5222594141960144, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0401, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.447127103805542, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0383, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.4780801832675934, + "learning_rate": 1.754802282200567e-05, + "loss": 0.041, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.2962804138660431, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0422, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.5125643014907837, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0337, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.4288216829299927, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0374, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4114690124988556, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0296, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.3511301577091217, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0315, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.8624657392501831, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0369, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.5518651008605957, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0364, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.5404661297798157, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0294, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.7494591474533081, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0315, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9748606085777283, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0429, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.8071768879890442, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0321, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.5210712552070618, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0355, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.6077958941459656, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0426, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.8688217997550964, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0366, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.7064969539642334, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0465, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0365, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.6350638270378113, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0419, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.42818939685821533, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0412, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.6915261745452881, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0327, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.9861057996749878, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.034, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.6910699009895325, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0463, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.6368144750595093, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0399, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 1.1909242868423462, + "learning_rate": 1.739216409306913e-05, + "loss": 0.042, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.6449970006942749, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0388, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.531061053276062, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0389, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.8275352716445923, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0503, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.8468548655509949, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0336, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.2949988842010498, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0342, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.30603477358818054, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 0.7177753448486328, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0381, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.4893733859062195, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0319, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.6618909239768982, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0317, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.5965152382850647, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0293, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.4357168674468994, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0478, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.9539002776145935, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0444, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.7171940207481384, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.037, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.5711817741394043, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.034, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.4134632647037506, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.39306095242500305, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0351, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.318985253572464, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0425, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.7674827575683594, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.041, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.7754977941513062, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0436, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.5827674269676208, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0371, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.3957774341106415, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0401, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.47415387630462646, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0344, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.6292631030082703, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.5913583636283875, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0385, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.465749055147171, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0402, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.7115443348884583, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0372, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.7476089596748352, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.042, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.5902891159057617, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0319, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.7117035984992981, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0312, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.7726907730102539, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0381, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.7318345308303833, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0464, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.8139578104019165, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0334, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.6128831505775452, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0338, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.478384405374527, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0361, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.36900776624679565, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0473, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 1.031351923942566, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0417, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.5248333215713501, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0402, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.6325647830963135, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.047, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.8417870402336121, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0406, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.617125391960144, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0385, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.4480224847793579, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0391, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 1.0203324556350708, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0379, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.6231842637062073, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0318, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37685611844062805, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0304, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 1.0700500011444092, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0362, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.4233555495738983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0341, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.7783017158508301, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0331, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.718287467956543, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0385, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.5477543473243713, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0308, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.5601311326026917, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0384, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.4944303631782532, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.5038384199142456, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0382, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.7288672924041748, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0319, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 1.0376721620559692, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0374, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.8827543258666992, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0351, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4307865798473358, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0321, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.5480561256408691, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0532, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.9598987102508545, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0365, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.4162677228450775, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0274, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.8729338049888611, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0437, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.7729384899139404, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0386, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.6997544169425964, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0303, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.49331608414649963, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0333, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.6684675812721252, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0329, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.5638986825942993, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.035, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.8375849723815918, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0431, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.5796175599098206, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0298, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.5302409529685974, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.032, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.43450990319252014, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0415, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.3897189795970917, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0372, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.8202592134475708, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0329, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.8023095726966858, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.3732883930206299, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0326, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.4916521906852722, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.031, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.46110638976097107, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.037, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.8587718605995178, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0351, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.7067242860794067, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.036, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.732545793056488, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.036, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.6573438048362732, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0392, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.6036579608917236, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0383, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.5556638836860657, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0396, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.7848073244094849, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0333, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.5758033394813538, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0315, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.5620765686035156, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0277, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.38210418820381165, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0437, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.6145310997962952, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0368, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.7370103001594543, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0349, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.942118763923645, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0399, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.5294848680496216, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0364, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.5716073513031006, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0313, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.4549729526042938, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0423, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.5841232538223267, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0369, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.3302208483219147, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.032, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.7107377648353577, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0382, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.6884296536445618, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0324, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.6279621720314026, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0314, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.882046103477478, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0408, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.8980706334114075, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0436, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.6433938145637512, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0395, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.6394492983818054, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.041, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.8700910806655884, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0333, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.6309515237808228, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0341, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.7955977916717529, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.8543604016304016, + "learning_rate": 1.663934987558109e-05, + "loss": 0.042, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.6915370225906372, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0347, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.6430726647377014, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0395, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.3080710768699646, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0299, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0407, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.7147136330604553, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0524, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.603560209274292, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.032, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.4913748502731323, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0419, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.532796323299408, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0463, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.7834717631340027, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0318, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.4865007698535919, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0329, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.5567988753318787, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0331, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.7487075328826904, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0408, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0294, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.7240496277809143, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0334, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.44733667373657227, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0378, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.7610008716583252, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0398, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 1.0738579034805298, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0461, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.5492804050445557, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0367, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.7817861437797546, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0392, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.6080313324928284, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0288, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.8218061923980713, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0335, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.6597305536270142, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0398, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.6254639625549316, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0339, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 1.0747283697128296, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0386, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.4679741859436035, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0409, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.7349653244018555, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0355, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.47712597250938416, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0524, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.8520345091819763, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0361, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.6470016837120056, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0296, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.8512763381004333, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0329, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.5876182913780212, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0381, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.47419166564941406, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0348, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.391215056180954, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0366, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.5373614430427551, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0373, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.23266319930553436, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0283, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.8146935105323792, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0377, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.5002696514129639, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0296, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.7518969774246216, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0394, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.44596755504608154, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0359, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.37095823884010315, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.031, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.48388785123825073, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0323, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.4681354761123657, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0573, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.9335370063781738, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0397, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.8231816291809082, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0307, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.7194622755050659, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0435, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.468923419713974, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.5806415677070618, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0422, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.6381694078445435, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0325, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.6025328636169434, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0321, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.7287771701812744, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0432, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.7109095454216003, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0315, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.4904409348964691, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0317, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.7382795214653015, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0296, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 1.2814927101135254, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.4594469368457794, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0297, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.5907943844795227, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.623093843460083, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0314, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.5146417021751404, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0362, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.5858095288276672, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0339, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.4178197383880615, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0445, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.37311851978302, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0321, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.6305625438690186, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0376, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.5927552580833435, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0339, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.4024806022644043, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.5766516327857971, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0325, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.4729812443256378, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0476, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.4650471806526184, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0387, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.6432391405105591, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.6335821151733398, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0307, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.5947774052619934, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0374, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.7248526811599731, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0286, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.5646173357963562, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0426, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.4240330457687378, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0261, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.6439619064331055, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0325, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.5899927020072937, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0328, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.6412765383720398, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.027, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.28143197298049927, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0285, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.2767931818962097, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0312, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.47175201773643494, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0318, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.4454171359539032, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0357, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.4573518931865692, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0319, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.5321150422096252, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0423, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.27531248331069946, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0284, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.663298487663269, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0328, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.9017484188079834, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0328, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0445, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.4777899980545044, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0348, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.5475958585739136, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0418, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.524467408657074, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0301, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.6302708387374878, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0334, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.41625329852104187, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0353, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2699313759803772, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0387, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.701999306678772, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.6053565144538879, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0343, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.864326000213623, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0371, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.7532107830047607, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0323, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.5603524446487427, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0357, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.5668624639511108, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0421, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.6352995038032532, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0381, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.7873902320861816, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0293, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.5853860378265381, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0336, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.525260329246521, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0404, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.4027518033981323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0334, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.9426722526550293, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0397, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.6003656983375549, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0408, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.643667459487915, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0507, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.6342907547950745, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0338, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.4388107657432556, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0393, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.3304736614227295, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0371, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.6479781866073608, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0357, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.5461524128913879, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0367, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.4362160563468933, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0302, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.5188114643096924, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0322, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.34805068373680115, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0355, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.5073755383491516, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0446, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.5647034645080566, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0386, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.5983169078826904, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.4163302481174469, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0278, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.5769792199134827, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.33103784918785095, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0272, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.6019038558006287, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0286, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.8199634552001953, + "learning_rate": 1.56658563993822e-05, + "loss": 0.041, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.7426667213439941, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0327, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.3630203306674957, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0316, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.7804543972015381, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0369, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.43314239382743835, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0362, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.5570499897003174, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0307, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.5796618461608887, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0312, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.7355082035064697, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0357, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.39807555079460144, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0281, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.7723329663276672, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0314, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.3936077058315277, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0344, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.6881195902824402, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0343, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.5343065857887268, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0336, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.6643530130386353, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.032, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.5642407536506653, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0326, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.6929567456245422, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0351, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.33013442158699036, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0362, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 1.056101679801941, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0443, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.5164589881896973, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0446, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.319035142660141, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0367, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.8530817627906799, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0321, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.7768056392669678, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0318, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.4015219211578369, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0263, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.6409371495246887, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0371, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.5829829573631287, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0424, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.8098331093788147, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0318, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.40581029653549194, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0345, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.5018268823623657, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0338, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.3689005970954895, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0304, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.4961407482624054, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0349, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.5551972389221191, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0389, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.5989762544631958, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0308, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.33431145548820496, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0291, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.5390793085098267, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0409, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.6348057389259338, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0299, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.9015149474143982, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0372, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.4148661494255066, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0351, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.48212167620658875, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0369, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.6210904121398926, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0387, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.4606397747993469, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0325, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.597671627998352, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0264, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.39612457156181335, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0291, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.514916718006134, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0327, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.3551333248615265, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0306, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.3721555173397064, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0343, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3669307231903076, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0339, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.5142899751663208, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0388, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.7722563147544861, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0319, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.5405625104904175, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.025, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.6617732048034668, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0361, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.8938334584236145, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0326, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.7913880944252014, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0325, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.6919751763343811, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0353, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.6518043279647827, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0292, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.8302627801895142, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0292, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.6278629302978516, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0314, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.42736759781837463, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0313, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 1.0469647645950317, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.038, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.4306422173976898, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.692587673664093, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.034, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.8272542953491211, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0332, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.700703501701355, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0435, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22474133968353271, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0348, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.47771376371383667, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0365, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.5043072700500488, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0336, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.4886966347694397, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0291, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.3845444321632385, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0418, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.6324570775032043, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0357, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.5614244937896729, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0351, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.4815816879272461, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0401, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.7729785442352295, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0357, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.589121401309967, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0319, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.5420895218849182, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0346, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.4504237771034241, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0279, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.26984909176826477, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.6075000762939453, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0319, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.6065084338188171, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0383, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.573225736618042, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0424, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.8821173906326294, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0409, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.4947790205478668, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0472, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.748337984085083, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0384, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.6375566124916077, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0373, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.6218035221099854, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0343, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.4296681880950928, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0317, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3609360158443451, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0348, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.49597665667533875, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.034, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.4339931309223175, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0351, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.44051092863082886, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0391, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.41610655188560486, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0345, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.6215106844902039, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0439, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.6418285965919495, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0289, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.6148926019668579, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0396, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.8690620064735413, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0371, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.4794996678829193, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.7622746229171753, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0396, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 1.0384955406188965, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0352, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.33424243330955505, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0272, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.5626234412193298, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0267, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.31714314222335815, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0297, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.8281066417694092, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0337, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.6054716110229492, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0336, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.5764144659042358, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0296, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.4696876108646393, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0318, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.5324695110321045, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0294, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.2989593744277954, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0275, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.6373855471611023, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0334, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.5332064032554626, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0333, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.4900652766227722, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.6812027096748352, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0321, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.6765509843826294, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0329, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.5016193389892578, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.034, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.5259473919868469, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0341, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.4551076292991638, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0289, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.5946309566497803, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0367, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.8045580387115479, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0292, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 1.089473843574524, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0433, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.7314861416816711, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0344, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.3244793713092804, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0329, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.9454575181007385, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.041, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.4321480393409729, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0338, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.7338399887084961, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0317, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.5811594724655151, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0299, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 1.1259782314300537, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0402, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.4460951089859009, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0279, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.4996945858001709, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0331, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.6428117156028748, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0339, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.7815113663673401, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0333, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.46364331245422363, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0321, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.6084109544754028, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0347, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.5775942206382751, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.4764224886894226, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0326, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.49608105421066284, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.033, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.40599140524864197, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0323, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.44920462369918823, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0348, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.393081396818161, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0329, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.5393109917640686, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0332, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.49641427397727966, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0341, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.4762181341648102, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0293, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.7498350143432617, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0338, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.5212231874465942, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0336, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3803718388080597, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0336, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.3723069429397583, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0313, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.6411343216896057, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0298, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.7487270832061768, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0334, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.4146348237991333, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0362, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.6354920864105225, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0345, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.8422425985336304, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.6452838182449341, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0317, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.6057304739952087, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0349, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.4880058467388153, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0283, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.6094764471054077, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0424, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.552979588508606, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0318, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.5134180188179016, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0267, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.3264164626598358, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0347, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.6406404972076416, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0326, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.4818336069583893, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0357, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.4660695791244507, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0348, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.527518093585968, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0293, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.5105645656585693, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0299, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.5807327628135681, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0348, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.34552720189094543, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0281, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.6902264952659607, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0345, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.7842390537261963, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0392, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.37371599674224854, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0307, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.4447094798088074, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0343, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.5179654359817505, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0328, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.34313148260116577, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0327, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.5038807988166809, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0398, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.5751231908798218, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0365, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.23205915093421936, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0338, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.3348182141780853, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0264, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.432725727558136, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0377, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.5504162907600403, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0334, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.7994229793548584, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0369, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.7374292016029358, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0305, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.786674976348877, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0283, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.39285191893577576, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.028, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.49710261821746826, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0285, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.2925172448158264, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0353, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.5930903553962708, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0265, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.5205737352371216, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0349, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.5042659044265747, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0376, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.6537132263183594, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0402, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.5453435182571411, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0344, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.7153663635253906, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0365, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.4821360409259796, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0359, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.389950156211853, + "learning_rate": 1.403120543105273e-05, + "loss": 0.031, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.6750137805938721, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0353, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.5380377173423767, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0329, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.45814576745033264, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0312, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.6910536289215088, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0349, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.49182868003845215, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0377, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.41329771280288696, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0383, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.47242429852485657, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0313, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.45115360617637634, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0294, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.44364428520202637, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0328, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.4205247461795807, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0282, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 1.0961225032806396, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0274, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.6065059304237366, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0327, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.3095875084400177, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0348, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.8527400493621826, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0285, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.4449825882911682, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0435, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 1.1708461046218872, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0312, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.6145966053009033, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0283, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.5100684762001038, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0331, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.37704023718833923, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0327, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.6774486899375916, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0347, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.4984931945800781, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0303, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.6189061403274536, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0316, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4665672183036804, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.038, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.898800790309906, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0292, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.5205129384994507, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0322, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.588542640209198, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0307, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.620620846748352, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.035, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.639234185218811, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0296, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.38672956824302673, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0355, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.5244165062904358, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0305, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.8960945010185242, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0323, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3789278566837311, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.031, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.5104514956474304, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0405, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.5860878825187683, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0376, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.9913963079452515, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0386, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.4112319350242615, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0276, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.703815221786499, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0303, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.7342479825019836, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0303, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.46025165915489197, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0324, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.3976695239543915, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0255, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.4137699604034424, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0298, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.6333696842193604, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0438, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.5179958343505859, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0268, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.5947912335395813, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0266, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.7916423678398132, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0363, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.7686305046081543, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0338, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.5727254152297974, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0275, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.8913756012916565, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0365, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.45855259895324707, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0401, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.8214496374130249, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0371, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.5001949667930603, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.033, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.6546716094017029, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0422, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.35789239406585693, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0323, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.7539666891098022, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0316, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.422543466091156, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0388, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.5595449805259705, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0351, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.3847978115081787, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0285, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.4276559352874756, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0292, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.5125867128372192, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0351, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.7208243012428284, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0293, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.5181360244750977, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0316, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.3499206304550171, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0281, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.26258599758148193, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.027, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.7002774477005005, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.031, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.5419202446937561, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0384, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.3112017512321472, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0234, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.6459445357322693, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0302, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.5128807425498962, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0385, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.41403454542160034, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0321, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.4647153615951538, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0358, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.29951611161231995, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0288, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.3440749943256378, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0274, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.413753867149353, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0276, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.29087361693382263, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.03, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.7001593708992004, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0277, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.47245970368385315, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0426, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.5747501850128174, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0337, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.42420580983161926, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0407, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.2931080162525177, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0344, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.8410253524780273, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0385, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.27601751685142517, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0304, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5673372745513916, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0261, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.5385505557060242, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0296, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.4159039556980133, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0343, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 1.0409079790115356, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0325, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.5017931461334229, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0311, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.45170727372169495, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0302, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.7260886430740356, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0353, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.7251535058021545, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0329, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.21863135695457458, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0354, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.5168152451515198, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0268, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.509765088558197, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0321, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.4227997958660126, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.031, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.5740527510643005, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0351, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.5497387647628784, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0277, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.3965212106704712, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.028, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.43198928236961365, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0421, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.42254316806793213, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0335, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.3395012617111206, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0309, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.6258816719055176, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0287, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.7914189100265503, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0263, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.4104739725589752, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0282, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.47704172134399414, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0358, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.7908433675765991, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0341, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.7039026021957397, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0369, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.4095489978790283, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.047, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.6500707864761353, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0285, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3794250190258026, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0293, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.3065261244773865, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.031, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.3773103654384613, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0303, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.602186918258667, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0398, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.5309048891067505, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0251, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.9474682211875916, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0345, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.7786683440208435, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0289, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.6320096850395203, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0326, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.7034086585044861, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0332, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.5060988664627075, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0337, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.7484520673751831, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0317, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.6556681394577026, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0349, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.41952699422836304, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0318, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.4678110182285309, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0328, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.35579657554626465, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0346, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.5984554290771484, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0277, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.41169118881225586, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0288, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.5163332223892212, + "learning_rate": 1.285944160290905e-05, + "loss": 0.027, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.780305802822113, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0249, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.4293205142021179, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0302, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.650065004825592, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0349, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.3155161142349243, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0333, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.5841111540794373, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0371, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.3873291015625, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0304, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.39657002687454224, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0279, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.6305680871009827, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0293, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.5810249447822571, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0317, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.6288999319076538, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0283, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.5402754545211792, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0258, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 1.3184820413589478, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0398, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.9564218521118164, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0301, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.8810652494430542, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0376, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.4254887104034424, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0336, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.45076319575309753, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0266, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.6057546138763428, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0292, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.4007343649864197, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0352, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.4183088541030884, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0265, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.368300199508667, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0326, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.4838104844093323, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0262, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.5136057138442993, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0299, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.5161435604095459, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0339, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.6350359320640564, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0361, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.5247905254364014, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0259, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.5668240785598755, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0324, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.48688119649887085, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0395, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.8496071100234985, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0326, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.7072296142578125, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0307, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.7262448072433472, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0376, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.5265096426010132, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0331, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.7246168851852417, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0286, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.4539868235588074, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.036, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.36881664395332336, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0302, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.37113773822784424, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0278, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.537762463092804, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0325, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.6519997715950012, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0309, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.31448549032211304, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0245, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.43815988302230835, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0398, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.525791585445404, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0261, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.4887944757938385, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.025, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.5287007689476013, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0278, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.7277513146400452, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0304, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.6415050029754639, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0292, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.48691895604133606, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0337, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.53068608045578, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0338, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.5464624762535095, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0303, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.3911614418029785, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0345, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.6894099116325378, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0365, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.5268317461013794, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0405, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.8635499477386475, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0321, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21542859077453613, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0264, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.6257337331771851, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0355, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.6525475978851318, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0304, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.4599299430847168, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0314, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.7497361898422241, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.031, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.3124896287918091, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0257, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.6170748472213745, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0323, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.4619428515434265, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0315, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.5088011026382446, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0255, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.5397948622703552, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0265, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.457082062959671, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0279, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.4131294786930084, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0269, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 1.1949660778045654, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.6057063341140747, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0306, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.26918280124664307, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0283, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.48841091990470886, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0323, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.6195886135101318, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0295, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.5798623561859131, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.031, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.4877539277076721, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0267, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.33261221647262573, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0261, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.8361077904701233, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0311, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.305922269821167, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0302, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.22662357985973358, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.028, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.4273515045642853, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0307, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.521216869354248, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0277, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.7090896368026733, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0346, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.3693661391735077, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0305, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.3651321530342102, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0263, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.5577923655509949, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0357, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.6504148840904236, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0404, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.49205282330513, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.035, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.6053458452224731, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0328, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.5949649214744568, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0302, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.5310356020927429, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0264, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4087911546230316, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0273, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.35929426550865173, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0274, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.5112904906272888, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0253, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.39148232340812683, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0305, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.47718697786331177, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0304, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.620936393737793, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0289, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.8953443169593811, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0328, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.4663226902484894, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0302, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.707167387008667, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0319, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.5325813889503479, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0318, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.6239158511161804, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0289, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.38823947310447693, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0266, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.48849165439605713, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0234, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.23214028775691986, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0276, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3467197120189667, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0282, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.2009357064962387, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0298, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.8589951395988464, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0264, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.43969056010246277, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0292, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.5750611424446106, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0289, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.5399556756019592, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0307, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.20517395436763763, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0249, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.7490189671516418, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0246, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.6661257743835449, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0325, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.571394681930542, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0342, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.8792482018470764, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0332, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.5770248770713806, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0286, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.62962406873703, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0246, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.4651380479335785, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.037, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.5087499022483826, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0265, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.44421979784965515, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0306, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.6521517038345337, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0334, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.5384942889213562, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0296, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.41909387707710266, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0297, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.6697047352790833, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0331, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.4015032947063446, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0326, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.48070228099823, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0278, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.8651071786880493, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0242, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 1.17703378200531, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0288, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.45865103602409363, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0322, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.41243845224380493, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0297, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.482997864484787, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0305, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.5319142937660217, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0284, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.6116752028465271, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0311, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.4214901328086853, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0269, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.6246733069419861, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.026, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.4263368248939514, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0305, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.4059041738510132, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.022, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.6362516283988953, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0265, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.2905973494052887, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0297, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.42270833253860474, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0255, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.26410749554634094, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0252, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.7570974230766296, + "learning_rate": 1.153689339251154e-05, + "loss": 0.027, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.5941224098205566, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0295, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.3985750079154968, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0337, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.3877560496330261, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.024, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.44742006063461304, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0284, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.3280893564224243, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0318, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.5289477109909058, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0341, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.4976208806037903, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0239, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.6153465509414673, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0252, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.6112402677536011, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0292, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.4973732531070709, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0307, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.5871816277503967, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0254, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 1.2150986194610596, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.033, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.6406526565551758, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0265, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.4251798093318939, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0269, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.4702431857585907, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0311, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.3235304355621338, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0236, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.4913889467716217, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0231, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.4980977177619934, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0289, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.740922212600708, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0334, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.3305300772190094, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0301, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.7037357091903687, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0311, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.44783756136894226, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0339, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.7776843309402466, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0349, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.49181437492370605, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0285, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.333814799785614, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0284, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 1.203652262687683, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0365, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.521643877029419, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0313, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.33309581875801086, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0265, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.48567256331443787, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0357, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8473871946334839, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0355, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.43827518820762634, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0266, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.5849157571792603, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0317, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.5690399408340454, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0266, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.6484784483909607, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0294, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.8894811272621155, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0239, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4575272798538208, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0323, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.4288756847381592, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.032, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.8871303200721741, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0243, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.5861580967903137, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0335, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.4159319996833801, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0247, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.6948496699333191, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0299, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.5089551210403442, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0333, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.6912631392478943, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0303, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.6213784217834473, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0295, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.4634060561656952, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0261, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.5664045214653015, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0262, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.7963227033615112, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0278, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.45378491282463074, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0268, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.8970746994018555, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0271, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.5109472274780273, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0307, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.5023297667503357, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0263, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.6055631041526794, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0285, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.38602766394615173, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0282, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.5447302460670471, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0319, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.6613780856132507, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0271, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 1.0358555316925049, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.026, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.4463629722595215, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0271, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.5373798608779907, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.025, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.7735916972160339, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0325, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.5017692446708679, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0262, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.3406142592430115, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0271, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.28971537947654724, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0238, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.45441415905952454, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0261, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.4653581976890564, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.026, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.5449947714805603, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0314, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.41015395522117615, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0272, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.5936392545700073, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0269, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.5043690800666809, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0256, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.6176534295082092, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0285, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.6774734258651733, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0268, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.7045454978942871, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0305, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.5905448794364929, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0284, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.7881343364715576, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0321, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.6635507941246033, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0284, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.46298888325691223, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0394, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.5187172889709473, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0257, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.5974661707878113, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0305, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.5171123743057251, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0275, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.35988888144493103, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0295, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.30543047189712524, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0334, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.6582810878753662, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0309, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.4986134171485901, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0294, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.5560855269432068, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0224, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.28974607586860657, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0313, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.24015791714191437, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.026, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.2704199552536011, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0244, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.6661707162857056, + "learning_rate": 1.068904422762975e-05, + "loss": 0.027, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.5058556795120239, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0254, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.7086800336837769, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0242, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.6752822399139404, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0262, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.8279762268066406, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0312, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.5070614814758301, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0308, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.3933897614479065, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0287, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.37238794565200806, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0325, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.7591347098350525, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0265, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.4841652810573578, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0331, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.45236295461654663, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0412, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.4774094820022583, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0289, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.47564345598220825, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0294, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.341337651014328, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0281, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.341701865196228, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0224, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.6621959209442139, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0283, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.348466694355011, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0234, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.35208311676979065, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0248, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.4973156154155731, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0246, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.3668982982635498, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0228, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.4771873950958252, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0303, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.3595021665096283, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0265, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.6013099551200867, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0297, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.40996676683425903, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0321, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.45742037892341614, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0288, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.8092222213745117, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0278, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.32741186022758484, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0288, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.5716732740402222, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0256, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.3263239562511444, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0271, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.35390567779541016, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0266, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.36520150303840637, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0265, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.46227532625198364, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0305, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.40079647302627563, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0327, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.3689155578613281, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0249, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.49527907371520996, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.029, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.38931334018707275, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0233, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.5698918700218201, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0269, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 1.0959579944610596, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.029, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.6321646571159363, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0276, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.7166606783866882, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0292, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.6464444994926453, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0246, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.7318128347396851, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0296, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.4828032851219177, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0247, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.4509548842906952, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0241, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.413630872964859, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0313, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.42443349957466125, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0316, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.8199112415313721, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0389, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.28918105363845825, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0242, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.6759344339370728, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0308, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.5480250120162964, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.025, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.48897549510002136, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.027, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.6111220121383667, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0276, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.8852546215057373, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0251, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.5098162889480591, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.022, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.45974940061569214, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0206, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3925095200538635, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0251, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.5461363792419434, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0217, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.5685333609580994, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0231, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.494150310754776, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0243, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.8770614862442017, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0286, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.27142134308815, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0253, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.3365682363510132, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0241, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.5512370467185974, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0242, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.5581703782081604, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0276, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.306773841381073, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0262, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.44620928168296814, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0229, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.5870804786682129, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0228, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.26162099838256836, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0278, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.27250319719314575, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0293, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.8330137729644775, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0315, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.5206989645957947, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0282, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.5408382415771484, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0359, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.30517199635505676, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0267, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.5315027236938477, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0206, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.46061626076698303, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0222, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.47393080592155457, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0262, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.3686772882938385, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0254, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.3312757611274719, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0243, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.565447986125946, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0267, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.5690101385116577, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0237, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.44088438153266907, + "learning_rate": 9.911670744652783e-06, + "loss": 0.028, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.3708919882774353, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0265, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.589698851108551, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0297, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.6541375517845154, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0288, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.5304558873176575, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0243, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.5774737000465393, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0277, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.5616280436515808, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0267, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.6129759550094604, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0223, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.45278221368789673, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0304, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.44487202167510986, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0296, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.5391712188720703, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0256, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.43523359298706055, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0277, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.5308435559272766, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0242, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.3361283540725708, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0236, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.3764631450176239, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0304, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.9003425240516663, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0278, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.2787775993347168, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0219, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.40089285373687744, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0284, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.3619711101055145, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0252, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.7354542016983032, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0242, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.3854006826877594, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0302, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.3318389058113098, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0265, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.5286651849746704, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0235, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.24921932816505432, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0259, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.7376067042350769, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0238, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.35099226236343384, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0257, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.3805389702320099, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0198, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.4433703124523163, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0241, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.3667793571949005, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0268, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.2963331639766693, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0223, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.9817414879798889, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0248, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.6529688835144043, + "learning_rate": 9.612315882780393e-06, + "loss": 0.032, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.7663154602050781, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0267, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.6086964011192322, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0281, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.5240464806556702, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0339, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.6558368802070618, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0284, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.6192268133163452, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0309, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.5293763875961304, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0257, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.38831329345703125, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0239, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.12827467918396, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0323, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.411818265914917, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0274, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.5521355867385864, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0233, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.26673075556755066, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0317, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.5205486416816711, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0273, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.8010990619659424, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0292, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.420612633228302, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0274, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.4811270236968994, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0277, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.4959382712841034, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0288, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.4607725739479065, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0245, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.9101414680480957, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0283, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.38626620173454285, + "learning_rate": 9.42959233811777e-06, + "loss": 0.026, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.5709372758865356, + "learning_rate": 9.419993062475743e-06, + "loss": 0.021, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.4417913854122162, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0291, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.5651213526725769, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0228, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.4716165363788605, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0242, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.9120892286300659, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0296, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.5004292130470276, + "learning_rate": 9.372024722887089e-06, + "loss": 0.033, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.3422714173793793, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0284, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.5391610264778137, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0362, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.5446203351020813, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0247, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.5441875457763672, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0284, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.48274070024490356, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0245, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.6035326719284058, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0226, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.3104001581668854, + "learning_rate": 9.304949604077693e-06, + "loss": 0.029, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.27859869599342346, + "learning_rate": 9.295375311262483e-06, + "loss": 0.022, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.3896406292915344, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0235, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.4526473581790924, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0289, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.6624506115913391, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0265, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.6976125836372375, + "learning_rate": 9.257098257046206e-06, + "loss": 0.029, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.5974310040473938, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0205, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.7627739906311035, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0333, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.3166525065898895, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0309, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.41519322991371155, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0223, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.31840237975120544, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0239, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.47412827610969543, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0228, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.41170552372932434, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0209, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.45858854055404663, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0243, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.7870534658432007, + "learning_rate": 9.171095634265995e-06, + "loss": 0.027, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.4080354869365692, + "learning_rate": 9.161550369445782e-06, + "loss": 0.023, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.47916823625564575, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0303, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.6911760568618774, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0263, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.3980148732662201, + "learning_rate": 9.132927564918328e-06, + "loss": 0.028, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.47085851430892944, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0266, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.5085862874984741, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0239, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.5219245553016663, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0267, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.5199264287948608, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0277, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.6157195568084717, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0343, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.5366696715354919, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0271, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.3640076220035553, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0258, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.5320505499839783, + "learning_rate": 9.05669731553499e-06, + "loss": 0.024, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.507826566696167, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0253, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.741392195224762, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0242, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.5325136184692383, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0224, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.4709665775299072, + "learning_rate": 9.018636566864313e-06, + "loss": 0.026, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.4371986985206604, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0264, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.47594818472862244, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0224, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.488423228263855, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0261, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.24745763838291168, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0206, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.5042629837989807, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0305, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.5255836844444275, + "learning_rate": 8.961615424107555e-06, + "loss": 0.026, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.4605107307434082, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0274, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.3252561390399933, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0227, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.35779184103012085, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0296, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.2960403263568878, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0212, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.6344659328460693, + "learning_rate": 8.914163487132906e-06, + "loss": 0.026, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.4614463150501251, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0234, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4490053951740265, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0265, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.5291271209716797, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0326, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.5311887264251709, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0257, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.5647584199905396, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0295, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.3913862705230713, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0256, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.4476219415664673, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0248, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.7807655930519104, + "learning_rate": 8.83836825410936e-06, + "loss": 0.026, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.38984328508377075, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0247, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.5757346153259277, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0296, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.25636178255081177, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0222, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.45617344975471497, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0224, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.3066493272781372, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0237, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.26513972878456116, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0277, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.445230633020401, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0248, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.4914413392543793, + "learning_rate": 8.762735374981932e-06, + "loss": 0.022, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.41469570994377136, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0245, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.33235347270965576, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0229, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.4890037775039673, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0247, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.41330578923225403, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0285, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.6309427618980408, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0233, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.42090296745300293, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0254, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.5888519287109375, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0262, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.5488774180412292, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0262, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.48015111684799194, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0219, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.4484168291091919, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0276, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.4128018319606781, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0218, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.5151517987251282, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0242, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.6248350143432617, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0267, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.4116908013820648, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0242, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.6138579249382019, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0282, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.22843605279922485, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0284, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.49555841088294983, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0244, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.5752411484718323, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0275, + "step": 18000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1243266036570522e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5fecc60b61aa66699566b01045633ce2fd4a6a74 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-18000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad96fcc5212b0fb64af2ed9b5a1ad33dee0cea6a86c08271b39c38f4388a38a +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5192576472042a90ea17c3138f118dcdd65ef7bf --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a565d2ee46e78dcd51183aab9840e26e1168d9aabbec5caba40ad9324abe73e5 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c700bf1f52d3b3dbeb154f68fdb2da0da6ab3ba0 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2435cbc40a01a7a4ceefff705b16f5b48d8bd438eda74d1cc000870beb2c2f91 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..57c04c26b5e58507fd76fc235539b5d0d9ddac16 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25206846617b2c8161ad3c4e21aa90dd9949028b61af18620dc64116dc642b45 +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9801b876d4902a6f04c8f4fc65c072e6082867 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -4.131592681121827, + -18.96289906921387, + -16.909606227111816, + -1.205507601451874, + -2.2364452423095704, + -1.8819086204528812, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 16.65274486618042, + 37.19429024200439, + 23.655689654541014, + 1.3209557065963748, + 2.6528479496955875, + 1.1486967510223387, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 2.868856906890869, + 6.296340465545654, + 1.3196077346801758, + 0.007151931058615446, + -0.012491658329963684, + -0.12626242637634277, + 0.12140887975692749, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 4.3321146965026855, + 12.4215087890625, + 7.703039169311523, + 0.391439288854599, + 0.8076039552688599, + 0.505150318145752, + 0.9926025867462158, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.763728466033935, + -21.229162658691408, + -2.350775989151001, + -4.0587354017257695, + -3.285622364997864, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.5495108631134, + 30.41332916412354, + 14.36571702880859, + 1.8286980584144592, + 2.2455153399467473, + 1.9114159921646117, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.469674587249756, + 1.137302041053772, + -3.50521183013916, + -0.009232619777321815, + -0.7088616490364075, + -0.43785586953163147, + 0.14176446199417114, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.948984146118164, + 16.641460418701172, + 8.162801742553711, + 0.6890953779220581, + 1.1180040836334229, + 0.9564125537872314, + 0.9899004101753235, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dbe86977af2b6f6abb68cc2343b0f3f610cedaba --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/trainer_state.json @@ -0,0 +1,14034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.198394151836539, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 5.55898904800415, + "learning_rate": 1.8e-07, + "loss": 0.7669, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.9875104427337646, + "learning_rate": 3.8e-07, + "loss": 0.7281, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 6.316451072692871, + "learning_rate": 5.800000000000001e-07, + "loss": 0.7134, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 4.037688255310059, + "learning_rate": 7.8e-07, + "loss": 0.6077, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 5.4920220375061035, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6779, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 3.809985876083374, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5578, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 5.501481533050537, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5453, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 2.584683418273926, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4145, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 2.854585886001587, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3617, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 3.2181553840637207, + "learning_rate": 1.98e-06, + "loss": 0.3402, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 1.6713179349899292, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2286, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 2.60302996635437, + "learning_rate": 2.38e-06, + "loss": 0.2477, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 1.7488818168640137, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1342, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 1.826812982559204, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.1243, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 1.1744091510772705, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1012, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 2.3573529720306396, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1108, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 2.1422371864318848, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1081, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.6756604313850403, + "learning_rate": 3.58e-06, + "loss": 0.0947, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 1.8197052478790283, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.103, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 2.135390281677246, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0791, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 1.185013771057129, + "learning_rate": 4.18e-06, + "loss": 0.0751, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 1.478454828262329, + "learning_rate": 4.38e-06, + "loss": 0.0685, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 1.1979939937591553, + "learning_rate": 4.58e-06, + "loss": 0.0642, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 1.3315266370773315, + "learning_rate": 4.78e-06, + "loss": 0.0706, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 1.219875454902649, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 1.9281997680664062, + "learning_rate": 5.18e-06, + "loss": 0.0781, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.5599610209465027, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0742, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.9128719568252563, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0638, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.5633432269096375, + "learning_rate": 5.78e-06, + "loss": 0.0633, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.7961149215698242, + "learning_rate": 5.98e-06, + "loss": 0.062, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 1.9408375024795532, + "learning_rate": 6.18e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 1.1925369501113892, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0654, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 1.0636825561523438, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0513, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.5671424865722656, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0561, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.8431388139724731, + "learning_rate": 6.98e-06, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 1.3813819885253906, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0619, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.7528055906295776, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0502, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 1.38446044921875, + "learning_rate": 7.58e-06, + "loss": 0.0623, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.9472984671592712, + "learning_rate": 7.78e-06, + "loss": 0.0471, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.640555739402771, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0539, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 1.4841065406799316, + "learning_rate": 8.18e-06, + "loss": 0.0684, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 1.0691452026367188, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.8026740550994873, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0579, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 1.3472259044647217, + "learning_rate": 8.78e-06, + "loss": 0.0725, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.8364902138710022, + "learning_rate": 8.98e-06, + "loss": 0.0613, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 1.0086181163787842, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0558, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 1.0559569597244263, + "learning_rate": 9.38e-06, + "loss": 0.0561, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.9138600826263428, + "learning_rate": 9.58e-06, + "loss": 0.0507, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.6099390387535095, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0543, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.890690803527832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.071, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.8349231481552124, + "learning_rate": 1.018e-05, + "loss": 0.0515, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 1.5466762781143188, + "learning_rate": 1.038e-05, + "loss": 0.0865, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 1.0859519243240356, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0511, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.7235454320907593, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.6314525008201599, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0494, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 1.5067164897918701, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0453, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.9329689145088196, + "learning_rate": 1.138e-05, + "loss": 0.0565, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 1.3631505966186523, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0513, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 1.2341063022613525, + "learning_rate": 1.178e-05, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.7126315832138062, + "learning_rate": 1.198e-05, + "loss": 0.0465, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.9995419383049011, + "learning_rate": 1.218e-05, + "loss": 0.0423, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.7614652514457703, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0466, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.7718682289123535, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0508, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.7280911803245544, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0481, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.6350377798080444, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0493, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.6868598461151123, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.057, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 1.132020354270935, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0464, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 1.097875952720642, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0465, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.8246905207633972, + "learning_rate": 1.378e-05, + "loss": 0.0488, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.5858931541442871, + "learning_rate": 1.398e-05, + "loss": 0.0533, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.418e-05, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.87618488073349, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0417, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.8312808871269226, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0627, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.5213949680328369, + "learning_rate": 1.478e-05, + "loss": 0.0526, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.7599508762359619, + "learning_rate": 1.498e-05, + "loss": 0.0487, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.9282987713813782, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0544, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 1.5959566831588745, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0594, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.6384497284889221, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.049, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.5377854108810425, + "learning_rate": 1.578e-05, + "loss": 0.0529, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.6186609864234924, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0485, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.9750168323516846, + "learning_rate": 1.618e-05, + "loss": 0.0458, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.6810588836669922, + "learning_rate": 1.638e-05, + "loss": 0.0521, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.8613447546958923, + "learning_rate": 1.658e-05, + "loss": 0.0464, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.8379164338111877, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0589, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.9312345385551453, + "learning_rate": 1.698e-05, + "loss": 0.0534, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.6983106732368469, + "learning_rate": 1.718e-05, + "loss": 0.0591, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.6549938321113586, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0571, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3887499272823334, + "learning_rate": 1.758e-05, + "loss": 0.0362, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 1.1392686367034912, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0602, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.834979772567749, + "learning_rate": 1.798e-05, + "loss": 0.0483, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.9094700813293457, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0536, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.9519254565238953, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0514, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.6514044404029846, + "learning_rate": 1.858e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.6005147099494934, + "learning_rate": 1.878e-05, + "loss": 0.0527, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 1.0990339517593384, + "learning_rate": 1.898e-05, + "loss": 0.0453, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.7029110193252563, + "learning_rate": 1.918e-05, + "loss": 0.0527, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.6106461882591248, + "learning_rate": 1.938e-05, + "loss": 0.043, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.48976996541023254, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0482, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 1.045139193534851, + "learning_rate": 1.978e-05, + "loss": 0.0449, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.7444337010383606, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0499, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.8378720879554749, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0606, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.5345956683158875, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.041, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.6428268551826477, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0648, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.9010246992111206, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0441, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.6655222177505493, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0532, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.5328973531723022, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0488, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 1.2394806146621704, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0525, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.9671902656555176, + "learning_rate": 1.999967041472886e-05, + "loss": 0.051, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.8754792213439941, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.054, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.524354875087738, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0682, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 1.0633796453475952, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0435, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.7348024249076843, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.923546552658081, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0501, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 1.0579051971435547, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.8214036822319031, + "learning_rate": 1.999882759038658e-05, + "loss": 0.057, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.7640904188156128, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0468, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5744732022285461, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0416, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.40397152304649353, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0389, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.6207796931266785, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0484, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 1.5230320692062378, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0586, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.8499330282211304, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0671, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.7697583436965942, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.061, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.6107252836227417, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.40468829870224, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0558, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.7711566686630249, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0487, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 1.0216137170791626, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0411, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 1.1135109663009644, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0428, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.545289158821106, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0426, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.9514102339744568, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0529, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.9448748826980591, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0468, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 1.1176340579986572, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.06, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.6428054571151733, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0398, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.8000763058662415, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0688, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.7624617218971252, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0524, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.7986068725585938, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0511, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 1.179044246673584, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0518, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.7511209845542908, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.041, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.8336644768714905, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.7198546528816223, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0472, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 1.404756784439087, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0479, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.861412525177002, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0448, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 1.2575286626815796, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0504, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.7020149230957031, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0416, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.9072129726409912, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0483, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.5503928661346436, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0498, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.5776561498641968, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.7854406237602234, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0431, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.7011817097663879, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0615, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.7760916352272034, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0525, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.9866206049919128, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0492, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.7466640472412109, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0564, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.8808642029762268, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0461, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.8980852365493774, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0613, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.6824257969856262, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0763, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.681532084941864, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0492, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.5667393207550049, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0471, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.5026432275772095, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0424, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.37448638677597046, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.037, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.6236661076545715, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.9748323559761047, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0326, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.7733910083770752, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0527, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.6466084718704224, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.6644402146339417, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0434, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 1.5936143398284912, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0495, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.5655786991119385, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0475, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.9557194709777832, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0518, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.8929481506347656, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0435, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.7515624761581421, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0404, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.7718303203582764, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0476, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.5583183765411377, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0495, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.7166038155555725, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0601, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.9311782717704773, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0507, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6159361600875854, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0319, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.816769003868103, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0505, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.9040331244468689, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.0498, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 1.696012020111084, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0689, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.5169436931610107, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0414, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 1.9156256914138794, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0558, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.6522107720375061, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0427, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.8480607867240906, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0425, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.6939795017242432, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0521, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.5763843059539795, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0486, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.6420201063156128, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0428, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.5305889248847961, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0371, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 1.3216971158981323, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0441, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.6441370844841003, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0444, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 1.4227683544158936, + "learning_rate": 1.996014938229576e-05, + "loss": 0.053, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.667000412940979, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0405, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.6865925192832947, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0532, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.8819414377212524, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0402, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.8738685250282288, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0494, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.8790421485900879, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0753, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.5451251268386841, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0385, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.46721863746643066, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0395, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.41896265745162964, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0461, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.7582527995109558, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0461, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.7154091596603394, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0464, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.788686215877533, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.46885132789611816, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0472, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.5174703598022461, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0501, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.8058022260665894, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.044, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.49327152967453003, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0404, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 1.532515048980713, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0548, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 1.1101130247116089, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0542, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.7396823763847351, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.042, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5801792740821838, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0589, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 1.4451886415481567, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0402, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.61793053150177, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0583, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.8073042631149292, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0492, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.9468027949333191, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0466, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.7384629249572754, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0589, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.4612124562263489, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.043, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.6821345090866089, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0373, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.6727206110954285, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0706, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.6935863494873047, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0376, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.9824007153511047, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0418, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.9782054424285889, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0453, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.7749345898628235, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0449, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 1.1558616161346436, + "learning_rate": 1.992544454099507e-05, + "loss": 0.051, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.33876606822013855, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0463, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.5539175271987915, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0389, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.554639995098114, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0375, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.46284249424934387, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0365, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.7209586501121521, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0465, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 1.0352572202682495, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0609, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.3893347680568695, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0449, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.3959295451641083, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.042, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.47758615016937256, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0608, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.7173318266868591, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0511, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.5889247059822083, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.5986958146095276, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.9506963491439819, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0513, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.8730902671813965, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0429, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.5152983069419861, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0347, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.786233127117157, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0464, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.7376151084899902, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0479, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.595055878162384, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0392, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.8207923769950867, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0441, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.7003177404403687, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.036, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.6637803316116333, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.5207458138465881, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0476, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 1.241939663887024, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0466, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.7212964296340942, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0459, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.6244897246360779, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.571205198764801, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0611, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.8839776515960693, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0464, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.580142080783844, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0434, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.6745111346244812, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0443, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.9726730585098267, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.48007458448410034, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0442, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.7205815315246582, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0461, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.5800597667694092, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0553, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.6497617959976196, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0398, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.7487000226974487, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.053, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.6686383485794067, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0494, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.6101617217063904, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0397, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.49039891362190247, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 1.076252818107605, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0472, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.7085466980934143, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0481, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.6343501210212708, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.7452435493469238, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0485, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.6645557880401611, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0455, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.5987662076950073, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0384, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 1.078682541847229, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0416, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.8880276083946228, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0427, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.8119439482688904, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0516, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.5018808245658875, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.035, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.623843252658844, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0468, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.48201584815979004, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0387, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.5672967433929443, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0374, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.7304291129112244, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0458, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 1.1493513584136963, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0495, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.8220258951187134, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0565, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 1.0740118026733398, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0484, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.6214267015457153, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0346, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6255515813827515, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 1.0625102519989014, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0511, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.8623147010803223, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.043, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.92961186170578, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0428, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.6050530076026917, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0405, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.944632351398468, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0434, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.4904105067253113, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0423, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.7352654337882996, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0425, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 1.0492011308670044, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0616, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.7823440432548523, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0447, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.8018720149993896, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0371, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.49853745102882385, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.036, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.8805229663848877, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0524, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.5573164820671082, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0387, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.7481330633163452, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0466, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.40816730260849, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0651, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.6791403889656067, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0393, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.7291558384895325, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0521, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.6312416791915894, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0489, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.7327824831008911, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0343, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 1.3112396001815796, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 1.2425460815429688, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0419, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.6839079856872559, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0491, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.7781338691711426, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0434, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.5329035520553589, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0468, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.7196246981620789, + "learning_rate": 1.978769450291435e-05, + "loss": 0.044, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.7625473737716675, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0441, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.5458085536956787, + "learning_rate": 1.978346349055984e-05, + "loss": 0.039, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.7765107154846191, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0467, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.7010345458984375, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.04, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.626748263835907, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0373, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.5149411559104919, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0461, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.9740221500396729, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.037, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.504397988319397, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.054, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.5483772158622742, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0365, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.29313552379608154, + "learning_rate": 1.976612732743278e-05, + "loss": 0.038, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.8453809022903442, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0413, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.5152369141578674, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0383, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.9969985485076904, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0465, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.9506912231445312, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0377, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.9154256582260132, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0428, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 1.2283018827438354, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0403, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.6880149841308594, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0395, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.4900283217430115, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0368, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.7604786157608032, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0447, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.559420108795166, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0456, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5867525339126587, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.4810929596424103, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0406, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.8294567465782166, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0405, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.8964418172836304, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0551, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5311513543128967, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.048, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.806564450263977, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0422, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.6752825975418091, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0436, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.5873673558235168, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.046, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.44951826333999634, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.6930672526359558, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0482, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5176821351051331, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0469, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.49050986766815186, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0505, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.7312544584274292, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0397, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.7582018375396729, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0472, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.5867499113082886, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0402, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.5435264706611633, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0357, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.7370457053184509, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.774713933467865, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0419, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 1.3614526987075806, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0443, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.6087996959686279, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0362, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.6685174703598022, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0437, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.9508783221244812, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0403, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.5553990006446838, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0454, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.5054144263267517, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0651, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.42293739318847656, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0431, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.7212286591529846, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0415, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.473127543926239, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.6872493028640747, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.031, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.5251455903053284, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0391, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.5380337834358215, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0409, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.7052116394042969, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0416, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.8229309916496277, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0372, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.9506240487098694, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0419, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.6417449116706848, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.6112877130508423, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0498, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 1.0621747970581055, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0478, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.7538444995880127, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0402, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.5625021457672119, + "learning_rate": 1.964833301001045e-05, + "loss": 0.048, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.47914358973503113, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0371, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.6854084134101868, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0478, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.9252145886421204, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0368, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.8439743518829346, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0417, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 1.0050065517425537, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0444, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.7451267242431641, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0444, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.8371824622154236, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0413, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 1.0461528301239014, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0343, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.39973369240760803, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0411, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.4291725754737854, + "learning_rate": 1.962083815106258e-05, + "loss": 0.035, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.7072318196296692, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.5897591710090637, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0422, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.724743664264679, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0412, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.6499989628791809, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0456, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.7375554442405701, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0481, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.5231707096099854, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0444, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.6235650777816772, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0352, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.43499720096588135, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0389, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.797736406326294, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0444, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 1.0550916194915771, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0504, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.6214169263839722, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0406, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.698083221912384, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0593, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.6379665732383728, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0493, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.5507146120071411, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0433, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.5956857204437256, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.44772031903266907, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0479, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.9360495209693909, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0434, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.5642439126968384, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0396, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.4046037495136261, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0408, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.5948778986930847, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0349, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.8199960589408875, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.035, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.4827987253665924, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0422, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.8324541449546814, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0396, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.4008340537548065, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0399, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.6216022372245789, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0456, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.37505266070365906, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0385, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.49176743626594543, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0394, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.5399725437164307, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0438, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.8310949802398682, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 1.1955338716506958, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0459, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 1.0068060159683228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0491, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.5460902452468872, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.7850955128669739, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.038, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.36727651953697205, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.042, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.5334084630012512, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0472, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.7271261215209961, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0382, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.5323888063430786, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0436, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.45585381984710693, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0374, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.7871994376182556, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0523, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.5605924129486084, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0394, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.6938880085945129, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0394, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.5804795026779175, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0437, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 1.0168874263763428, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0419, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.6860261559486389, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0381, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.7029629349708557, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.5081820487976074, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0359, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4721413254737854, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0445, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.36132606863975525, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0443, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.6331628561019897, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.5754039287567139, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0364, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 1.5680726766586304, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0568, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.49352893233299255, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0352, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.6292720437049866, + "learning_rate": 1.945830755977688e-05, + "loss": 0.056, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.7185224294662476, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.5580431222915649, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0395, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.7590157985687256, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0367, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.6500505208969116, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0373, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.408975213766098, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0458, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.5616204142570496, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.6361889243125916, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0371, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.8486977219581604, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0428, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.7492835521697998, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0444, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.7901867032051086, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0413, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.6845218539237976, + "learning_rate": 1.942106227801521e-05, + "loss": 0.041, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.9644033908843994, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0482, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.45466694235801697, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.37155815958976746, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0563, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.4936427175998688, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0466, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.6540364027023315, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0426, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.38369905948638916, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.5450782179832458, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0499, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.24151510000228882, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0431, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.8069043159484863, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0447, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.5423257946968079, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0496, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.4058588445186615, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0402, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.6126188635826111, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0458, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.7490487694740295, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0493, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.7295238971710205, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0462, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.7178632616996765, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0443, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.7040836215019226, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0414, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.6338651776313782, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0354, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 1.3360642194747925, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0503, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.46927154064178467, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.7340303659439087, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0381, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5492366552352905, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0328, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.7509336471557617, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0368, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.4471103847026825, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0405, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.6582043170928955, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0422, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.6933317184448242, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0347, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.450021892786026, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.5376274585723877, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0619, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.722744882106781, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0446, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.6075776219367981, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.047, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.6950559020042419, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0366, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.5763269066810608, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0416, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.5462995767593384, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.042, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.6304270029067993, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0388, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.6828057765960693, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0324, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.37152284383773804, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0451, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.4172256588935852, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0357, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.5640333294868469, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0522, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.8016167879104614, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0381, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.591262698173523, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0382, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.5212893486022949, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0478, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.7837402820587158, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0443, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.9257993698120117, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0468, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.5952717065811157, + "learning_rate": 1.926404507646751e-05, + "loss": 0.033, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.9675727486610413, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0451, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5518060326576233, + "learning_rate": 1.925630281527157e-05, + "loss": 0.039, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.9742224216461182, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0398, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.6197847723960876, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0466, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.47963154315948486, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0449, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.41337478160858154, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0441, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.7238340973854065, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0438, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.9248948097229004, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.059, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.6670559048652649, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0388, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.956350564956665, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0402, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.6378766894340515, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0377, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.9037134647369385, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.046, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.7720431685447693, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0519, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.7988153100013733, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0437, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.6672379970550537, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.8264118432998657, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0463, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.6753244400024414, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.048, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.5530163645744324, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0552, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 1.4215611219406128, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0537, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.8495141267776489, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0431, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.5609806180000305, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0355, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.30011680722236633, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0503, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.5155858993530273, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0402, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.48371294140815735, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0476, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.49065709114074707, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.4877799451351166, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0337, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.5917441248893738, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0379, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.42583322525024414, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.045, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.6343463659286499, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0449, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.8575727343559265, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0453, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.7644649147987366, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0396, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.6534778475761414, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0354, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.3632607161998749, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.035, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.9180629849433899, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.48914220929145813, + "learning_rate": 1.912298771234382e-05, + "loss": 0.043, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.8579902052879333, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0467, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 1.523177146911621, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 1.2650493383407593, + "learning_rate": 1.911035077753307e-05, + "loss": 0.046, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.8262631893157959, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0345, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.8710194826126099, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.8287770748138428, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.7243760824203491, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0445, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5953600406646729, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0409, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.5678296685218811, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0405, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.764759361743927, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0399, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.5969082713127136, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0345, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.5686851739883423, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0415, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.7042335867881775, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0343, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.46049684286117554, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0367, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.521037757396698, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0493, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.6116137504577637, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0341, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.6932541728019714, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.038, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.6795322299003601, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0555, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 1.5589205026626587, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0498, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.58689945936203, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0432, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.7746279239654541, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0455, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4707143008708954, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0365, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.6717873811721802, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0441, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 1.1001774072647095, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0387, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.6617273092269897, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.045, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 1.0732862949371338, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0461, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.43623387813568115, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0387, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.5842541456222534, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0401, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.8832051753997803, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0434, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.8454849123954773, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0364, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4587421119213104, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0342, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.5914700627326965, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0381, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.5075448751449585, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0614, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.6165316700935364, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0355, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.8761339783668518, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0382, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.8730667233467102, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0486, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.4631735384464264, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0479, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.7657212615013123, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0359, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.49685898423194885, + "learning_rate": 1.894749443411004e-05, + "loss": 0.037, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.8567603230476379, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0415, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.8778802156448364, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0427, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.7849876284599304, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.041, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.49304109811782837, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0406, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.6490961909294128, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0457, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 1.1704363822937012, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0489, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.6955687403678894, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0426, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.9385222792625427, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0397, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 1.0259507894515991, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0406, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 1.5581048727035522, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0377, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 1.1154224872589111, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0352, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.8913238048553467, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0372, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.32929253578186035, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0302, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.7686375379562378, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.7077587246894836, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0404, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.7370178699493408, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0379, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.8013477325439453, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0391, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.9743591547012329, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0466, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.6816489100456238, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0509, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.7814317345619202, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0449, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.6797910332679749, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.041, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.7159250378608704, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0408, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.7630175352096558, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0403, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.7929314374923706, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0468, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.5765302181243896, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0382, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.5043740272521973, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0447, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.7895818948745728, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0381, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.8037170767784119, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0434, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 1.0758732557296753, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0369, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.6673927307128906, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0475, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.6661775708198547, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0478, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.6422731280326843, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.6632615923881531, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0377, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.5715954899787903, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0306, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3375200629234314, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0385, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.42938506603240967, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0359, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.453436940908432, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0498, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.763883113861084, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.037, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.9350517392158508, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0524, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.6795313358306885, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0336, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4761887788772583, + "learning_rate": 1.875213208215953e-05, + "loss": 0.04, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.6547576189041138, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0359, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.7119831442832947, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0382, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.5195598602294922, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0577, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.44893282651901245, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.034, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.5159012079238892, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0374, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.6474353075027466, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0275, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.5070436000823975, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0382, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.28868627548217773, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0442, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.3915226459503174, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.6271824836730957, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0395, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 1.2117619514465332, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0409, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.4455721378326416, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0399, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0445, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.32646581530570984, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0435, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.4477322995662689, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0383, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.6562448740005493, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0317, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.25427868962287903, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0326, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.6234788298606873, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0328, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.4264411926269531, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0379, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.5537038445472717, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0383, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.5042442679405212, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0339, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.4152010679244995, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0324, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.6834092736244202, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0364, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.6276392340660095, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0336, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.687937319278717, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0415, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.48481765389442444, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0376, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 1.1335153579711914, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0421, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.6853719353675842, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.043, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.97500079870224, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0334, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.2953243553638458, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0334, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.6563237309455872, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0349, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.4983973205089569, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0441, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.42969775199890137, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0319, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.8316324353218079, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0359, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.4386466443538666, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0371, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.5664681792259216, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0359, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.5660601854324341, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.6432987451553345, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0447, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.6026568412780762, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0382, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.5358585119247437, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0366, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.3575671315193176, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0394, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.6645073890686035, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0391, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.6527594923973083, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0334, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.5664045810699463, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0426, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.3317505419254303, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0366, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.7218614220619202, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0399, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.6683867573738098, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0385, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.6589217185974121, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0445, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.39663317799568176, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0515, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.9468401074409485, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 1.0980640649795532, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0431, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 1.4567275047302246, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0467, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.3785778284072876, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0437, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.8112056255340576, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0406, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.8885411024093628, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0452, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.3356691002845764, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.033, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.7636258602142334, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.039, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.5050523281097412, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0331, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3761812150478363, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0346, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.560323178768158, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0417, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.5850566625595093, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0366, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.4377721846103668, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0315, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.5460193157196045, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0465, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.3818223476409912, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0313, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.566722571849823, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.037, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.970040500164032, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0354, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4968736171722412, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0376, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.5235893130302429, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0383, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.853208065032959, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0384, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.4627811312675476, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0615, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.4883791208267212, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0307, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.4702740013599396, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0539, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.5020611882209778, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0378, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.706611692905426, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0309, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.6137747764587402, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0364, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.45299193263053894, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0359, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.31410297751426697, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0425, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.48510870337486267, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.04, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.4697261154651642, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0401, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.8231471180915833, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0346, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.9511741995811462, + "learning_rate": 1.832162565208597e-05, + "loss": 0.038, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.4473752975463867, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0421, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.5309840440750122, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0375, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 1.1700010299682617, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0424, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.5007262229919434, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0389, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.8835527300834656, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.6059357523918152, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.37744027376174927, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0391, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.5641717910766602, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0383, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.4394749104976654, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0394, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.7094572186470032, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0384, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.6306723952293396, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0347, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.4480315148830414, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0415, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 1.014607310295105, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0426, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.7599517107009888, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0433, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 1.0942739248275757, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0378, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.47618037462234497, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0312, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6470023393630981, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0382, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.6031871438026428, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0336, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.7470970749855042, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0318, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.46166181564331055, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0361, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5585920214653015, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0443, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.5172198414802551, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0396, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.4908123314380646, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0294, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.5269665122032166, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0343, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.747257649898529, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0395, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.6794129610061646, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0471, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.4291394054889679, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0388, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.8051080107688904, + "learning_rate": 1.815952390818299e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.557299792766571, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0384, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.37832972407341003, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0333, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.30844688415527344, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.033, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.3014371395111084, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0344, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.778361439704895, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0351, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 1.14492666721344, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0462, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.35099321603775024, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0371, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.8470032215118408, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0339, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.641718327999115, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0363, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.6668172478675842, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0383, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.9396918416023254, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0401, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.5773718953132629, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0356, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.6474881172180176, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0487, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.5183063745498657, + "learning_rate": 1.807599344877606e-05, + "loss": 0.037, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.7699562311172485, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0487, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.6379490494728088, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0407, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4757876396179199, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0307, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.47382786870002747, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0367, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.6868136525154114, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0311, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.5475189089775085, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0293, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 1.013775110244751, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0383, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.46351560950279236, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0404, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.4883617162704468, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0408, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.6282979249954224, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0428, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 1.0833567380905151, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0394, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0405, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.7581565380096436, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0534, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.7900646328926086, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0432, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.6033529043197632, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0438, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.924926221370697, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0347, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.8485580682754517, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0523, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.3205278217792511, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0334, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.5392606854438782, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.03, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.6815987229347229, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0385, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.9605218768119812, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0359, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.5565723776817322, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0391, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.7528144717216492, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0431, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.5746167898178101, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0346, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.5058369636535645, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0346, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 1.1387027502059937, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0372, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.819324254989624, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0374, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.45600345730781555, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0344, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.7428935766220093, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0373, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.6960753202438354, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0387, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.6637990474700928, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0404, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.5612137317657471, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0375, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.6323001384735107, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0379, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.35169267654418945, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0371, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.38252803683280945, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0457, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.38694459199905396, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0345, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.37036198377609253, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0292, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.8060199618339539, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0398, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.44252580404281616, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0373, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.5565180778503418, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0345, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.4460795521736145, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0404, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.7309815883636475, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0364, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.6990997195243835, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0561, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.4198327660560608, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0401, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.04, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.48884230852127075, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0334, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.6440362930297852, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0451, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.9092825055122375, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0398, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.4839508533477783, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.8128801584243774, + "learning_rate": 1.776452218695584e-05, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.5291397571563721, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0394, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.6852243542671204, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0418, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.6294205188751221, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0374, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.5221384763717651, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0321, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.398296982049942, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0349, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.43008267879486084, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.6012991070747375, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0411, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.45076051354408264, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.037, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.6742259860038757, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0357, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.5989789962768555, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.037, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.4041040241718292, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0325, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.4937855899333954, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0354, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.5446217656135559, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0374, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.7479701638221741, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0415, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.7822495102882385, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0341, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.3672648072242737, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.035, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.5219965577125549, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0443, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.4092100262641907, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0331, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.5316944122314453, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0406, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 1.072263240814209, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0521, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.7448581457138062, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0362, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.44557711482048035, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0326, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.4298631250858307, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0365, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.45413365960121155, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0351, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.9562819004058838, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0394, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.9481335878372192, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0381, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.5020818114280701, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0402, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.6412234902381897, + "learning_rate": 1.758137056131309e-05, + "loss": 0.037, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.5511493682861328, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0535, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.5222594141960144, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0401, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.447127103805542, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0383, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.4780801832675934, + "learning_rate": 1.754802282200567e-05, + "loss": 0.041, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.2962804138660431, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0422, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.5125643014907837, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0337, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.4288216829299927, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0374, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4114690124988556, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0296, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.3511301577091217, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0315, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.8624657392501831, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0369, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.5518651008605957, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0364, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.5404661297798157, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0294, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.7494591474533081, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0315, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9748606085777283, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0429, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.8071768879890442, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0321, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.5210712552070618, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0355, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.6077958941459656, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0426, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.8688217997550964, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0366, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.7064969539642334, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0465, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0365, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.6350638270378113, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0419, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.42818939685821533, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0412, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.6915261745452881, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0327, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.9861057996749878, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.034, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.6910699009895325, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0463, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.6368144750595093, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0399, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 1.1909242868423462, + "learning_rate": 1.739216409306913e-05, + "loss": 0.042, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.6449970006942749, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0388, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.531061053276062, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0389, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.8275352716445923, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0503, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.8468548655509949, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0336, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.2949988842010498, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0342, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.30603477358818054, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 0.7177753448486328, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0381, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.4893733859062195, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0319, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.6618909239768982, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0317, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.5965152382850647, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0293, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.4357168674468994, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0478, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.9539002776145935, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0444, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.7171940207481384, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.037, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.5711817741394043, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.034, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.4134632647037506, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.39306095242500305, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0351, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.318985253572464, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0425, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.7674827575683594, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.041, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.7754977941513062, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0436, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.5827674269676208, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0371, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.3957774341106415, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0401, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.47415387630462646, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0344, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.6292631030082703, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.5913583636283875, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0385, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.465749055147171, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0402, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.7115443348884583, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0372, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.7476089596748352, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.042, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.5902891159057617, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0319, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.7117035984992981, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0312, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.7726907730102539, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0381, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.7318345308303833, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0464, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.8139578104019165, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0334, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.6128831505775452, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0338, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.478384405374527, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0361, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.36900776624679565, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0473, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 1.031351923942566, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0417, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.5248333215713501, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0402, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.6325647830963135, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.047, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.8417870402336121, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0406, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.617125391960144, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0385, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.4480224847793579, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0391, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 1.0203324556350708, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0379, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.6231842637062073, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0318, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37685611844062805, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0304, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 1.0700500011444092, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0362, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.4233555495738983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0341, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.7783017158508301, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0331, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.718287467956543, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0385, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.5477543473243713, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0308, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.5601311326026917, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0384, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.4944303631782532, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.5038384199142456, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0382, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.7288672924041748, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0319, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 1.0376721620559692, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0374, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.8827543258666992, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0351, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4307865798473358, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0321, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.5480561256408691, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0532, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.9598987102508545, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0365, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.4162677228450775, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0274, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.8729338049888611, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0437, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.7729384899139404, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0386, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.6997544169425964, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0303, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.49331608414649963, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0333, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.6684675812721252, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0329, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.5638986825942993, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.035, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.8375849723815918, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0431, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.5796175599098206, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0298, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.5302409529685974, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.032, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.43450990319252014, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0415, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.3897189795970917, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0372, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.8202592134475708, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0329, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.8023095726966858, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.3732883930206299, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0326, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.4916521906852722, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.031, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.46110638976097107, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.037, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.8587718605995178, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0351, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.7067242860794067, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.036, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.732545793056488, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.036, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.6573438048362732, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0392, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.6036579608917236, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0383, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.5556638836860657, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0396, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.7848073244094849, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0333, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.5758033394813538, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0315, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.5620765686035156, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0277, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.38210418820381165, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0437, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.6145310997962952, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0368, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.7370103001594543, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0349, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.942118763923645, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0399, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.5294848680496216, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0364, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.5716073513031006, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0313, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.4549729526042938, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0423, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.5841232538223267, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0369, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.3302208483219147, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.032, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.7107377648353577, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0382, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.6884296536445618, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0324, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.6279621720314026, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0314, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.882046103477478, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0408, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.8980706334114075, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0436, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.6433938145637512, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0395, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.6394492983818054, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.041, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.8700910806655884, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0333, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.6309515237808228, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0341, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.7955977916717529, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.8543604016304016, + "learning_rate": 1.663934987558109e-05, + "loss": 0.042, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.6915370225906372, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0347, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.6430726647377014, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0395, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.3080710768699646, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0299, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0407, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.7147136330604553, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0524, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.603560209274292, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.032, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.4913748502731323, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0419, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.532796323299408, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0463, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.7834717631340027, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0318, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.4865007698535919, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0329, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.5567988753318787, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0331, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.7487075328826904, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0408, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0294, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.7240496277809143, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0334, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.44733667373657227, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0378, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.7610008716583252, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0398, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 1.0738579034805298, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0461, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.5492804050445557, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0367, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.7817861437797546, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0392, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.6080313324928284, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0288, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.8218061923980713, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0335, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.6597305536270142, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0398, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.6254639625549316, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0339, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 1.0747283697128296, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0386, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.4679741859436035, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0409, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.7349653244018555, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0355, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.47712597250938416, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0524, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.8520345091819763, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0361, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.6470016837120056, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0296, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.8512763381004333, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0329, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.5876182913780212, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0381, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.47419166564941406, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0348, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.391215056180954, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0366, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.5373614430427551, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0373, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.23266319930553436, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0283, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.8146935105323792, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0377, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.5002696514129639, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0296, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.7518969774246216, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0394, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.44596755504608154, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0359, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.37095823884010315, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.031, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.48388785123825073, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0323, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.4681354761123657, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0573, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.9335370063781738, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0397, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.8231816291809082, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0307, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.7194622755050659, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0435, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.468923419713974, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.5806415677070618, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0422, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.6381694078445435, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0325, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.6025328636169434, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0321, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.7287771701812744, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0432, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.7109095454216003, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0315, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.4904409348964691, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0317, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.7382795214653015, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0296, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 1.2814927101135254, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.4594469368457794, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0297, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.5907943844795227, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.623093843460083, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0314, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.5146417021751404, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0362, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.5858095288276672, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0339, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.4178197383880615, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0445, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.37311851978302, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0321, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.6305625438690186, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0376, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.5927552580833435, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0339, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.4024806022644043, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.5766516327857971, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0325, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.4729812443256378, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0476, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.4650471806526184, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0387, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.6432391405105591, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.6335821151733398, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0307, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.5947774052619934, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0374, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.7248526811599731, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0286, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.5646173357963562, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0426, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.4240330457687378, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0261, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.6439619064331055, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0325, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.5899927020072937, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0328, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.6412765383720398, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.027, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.28143197298049927, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0285, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.2767931818962097, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0312, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.47175201773643494, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0318, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.4454171359539032, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0357, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.4573518931865692, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0319, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.5321150422096252, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0423, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.27531248331069946, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0284, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.663298487663269, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0328, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.9017484188079834, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0328, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0445, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.4777899980545044, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0348, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.5475958585739136, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0418, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.524467408657074, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0301, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.6302708387374878, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0334, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.41625329852104187, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0353, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2699313759803772, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0387, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.701999306678772, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.6053565144538879, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0343, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.864326000213623, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0371, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.7532107830047607, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0323, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.5603524446487427, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0357, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.5668624639511108, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0421, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.6352995038032532, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0381, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.7873902320861816, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0293, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.5853860378265381, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0336, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.525260329246521, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0404, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.4027518033981323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0334, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.9426722526550293, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0397, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.6003656983375549, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0408, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.643667459487915, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0507, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.6342907547950745, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0338, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.4388107657432556, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0393, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.3304736614227295, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0371, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.6479781866073608, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0357, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.5461524128913879, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0367, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.4362160563468933, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0302, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.5188114643096924, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0322, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.34805068373680115, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0355, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.5073755383491516, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0446, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.5647034645080566, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0386, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.5983169078826904, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.4163302481174469, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0278, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.5769792199134827, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.33103784918785095, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0272, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.6019038558006287, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0286, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.8199634552001953, + "learning_rate": 1.56658563993822e-05, + "loss": 0.041, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.7426667213439941, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0327, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.3630203306674957, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0316, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.7804543972015381, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0369, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.43314239382743835, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0362, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.5570499897003174, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0307, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.5796618461608887, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0312, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.7355082035064697, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0357, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.39807555079460144, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0281, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.7723329663276672, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0314, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.3936077058315277, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0344, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.6881195902824402, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0343, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.5343065857887268, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0336, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.6643530130386353, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.032, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.5642407536506653, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0326, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.6929567456245422, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0351, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.33013442158699036, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0362, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 1.056101679801941, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0443, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.5164589881896973, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0446, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.319035142660141, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0367, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.8530817627906799, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0321, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.7768056392669678, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0318, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.4015219211578369, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0263, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.6409371495246887, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0371, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.5829829573631287, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0424, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.8098331093788147, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0318, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.40581029653549194, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0345, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.5018268823623657, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0338, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.3689005970954895, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0304, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.4961407482624054, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0349, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.5551972389221191, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0389, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.5989762544631958, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0308, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.33431145548820496, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0291, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.5390793085098267, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0409, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.6348057389259338, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0299, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.9015149474143982, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0372, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.4148661494255066, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0351, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.48212167620658875, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0369, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.6210904121398926, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0387, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.4606397747993469, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0325, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.597671627998352, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0264, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.39612457156181335, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0291, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.514916718006134, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0327, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.3551333248615265, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0306, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.3721555173397064, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0343, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3669307231903076, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0339, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.5142899751663208, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0388, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.7722563147544861, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0319, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.5405625104904175, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.025, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.6617732048034668, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0361, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.8938334584236145, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0326, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.7913880944252014, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0325, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.6919751763343811, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0353, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.6518043279647827, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0292, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.8302627801895142, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0292, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.6278629302978516, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0314, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.42736759781837463, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0313, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 1.0469647645950317, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.038, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.4306422173976898, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.692587673664093, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.034, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.8272542953491211, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0332, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.700703501701355, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0435, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22474133968353271, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0348, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.47771376371383667, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0365, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.5043072700500488, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0336, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.4886966347694397, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0291, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.3845444321632385, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0418, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.6324570775032043, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0357, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.5614244937896729, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0351, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.4815816879272461, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0401, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.7729785442352295, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0357, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.589121401309967, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0319, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.5420895218849182, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0346, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.4504237771034241, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0279, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.26984909176826477, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.6075000762939453, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0319, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.6065084338188171, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0383, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.573225736618042, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0424, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.8821173906326294, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0409, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.4947790205478668, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0472, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.748337984085083, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0384, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.6375566124916077, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0373, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.6218035221099854, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0343, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.4296681880950928, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0317, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3609360158443451, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0348, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.49597665667533875, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.034, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.4339931309223175, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0351, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.44051092863082886, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0391, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.41610655188560486, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0345, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.6215106844902039, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0439, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.6418285965919495, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0289, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.6148926019668579, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0396, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.8690620064735413, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0371, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.4794996678829193, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.7622746229171753, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0396, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 1.0384955406188965, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0352, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.33424243330955505, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0272, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.5626234412193298, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0267, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.31714314222335815, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0297, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.8281066417694092, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0337, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.6054716110229492, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0336, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.5764144659042358, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0296, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.4696876108646393, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0318, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.5324695110321045, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0294, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.2989593744277954, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0275, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.6373855471611023, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0334, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.5332064032554626, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0333, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.4900652766227722, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.6812027096748352, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0321, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.6765509843826294, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0329, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.5016193389892578, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.034, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.5259473919868469, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0341, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.4551076292991638, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0289, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.5946309566497803, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0367, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.8045580387115479, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0292, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 1.089473843574524, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0433, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.7314861416816711, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0344, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.3244793713092804, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0329, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.9454575181007385, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.041, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.4321480393409729, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0338, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.7338399887084961, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0317, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.5811594724655151, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0299, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 1.1259782314300537, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0402, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.4460951089859009, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0279, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.4996945858001709, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0331, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.6428117156028748, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0339, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.7815113663673401, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0333, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.46364331245422363, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0321, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.6084109544754028, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0347, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.5775942206382751, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.4764224886894226, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0326, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.49608105421066284, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.033, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.40599140524864197, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0323, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.44920462369918823, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0348, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.393081396818161, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0329, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.5393109917640686, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0332, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.49641427397727966, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0341, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.4762181341648102, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0293, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.7498350143432617, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0338, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.5212231874465942, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0336, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3803718388080597, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0336, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.3723069429397583, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0313, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.6411343216896057, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0298, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.7487270832061768, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0334, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.4146348237991333, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0362, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.6354920864105225, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0345, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.8422425985336304, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.6452838182449341, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0317, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.6057304739952087, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0349, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.4880058467388153, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0283, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.6094764471054077, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0424, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.552979588508606, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0318, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.5134180188179016, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0267, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.3264164626598358, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0347, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.6406404972076416, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0326, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.4818336069583893, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0357, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.4660695791244507, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0348, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.527518093585968, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0293, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.5105645656585693, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0299, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.5807327628135681, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0348, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.34552720189094543, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0281, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.6902264952659607, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0345, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.7842390537261963, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0392, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.37371599674224854, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0307, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.4447094798088074, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0343, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.5179654359817505, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0328, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.34313148260116577, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0327, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.5038807988166809, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0398, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.5751231908798218, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0365, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.23205915093421936, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0338, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.3348182141780853, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0264, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.432725727558136, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0377, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.5504162907600403, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0334, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.7994229793548584, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0369, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.7374292016029358, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0305, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.786674976348877, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0283, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.39285191893577576, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.028, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.49710261821746826, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0285, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.2925172448158264, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0353, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.5930903553962708, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0265, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.5205737352371216, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0349, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.5042659044265747, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0376, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.6537132263183594, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0402, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.5453435182571411, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0344, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.7153663635253906, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0365, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.4821360409259796, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0359, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.389950156211853, + "learning_rate": 1.403120543105273e-05, + "loss": 0.031, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.6750137805938721, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0353, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.5380377173423767, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0329, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.45814576745033264, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0312, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.6910536289215088, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0349, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.49182868003845215, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0377, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.41329771280288696, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0383, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.47242429852485657, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0313, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.45115360617637634, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0294, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.44364428520202637, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0328, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.4205247461795807, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0282, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 1.0961225032806396, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0274, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.6065059304237366, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0327, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.3095875084400177, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0348, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.8527400493621826, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0285, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.4449825882911682, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0435, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 1.1708461046218872, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0312, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.6145966053009033, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0283, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.5100684762001038, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0331, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.37704023718833923, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0327, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.6774486899375916, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0347, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.4984931945800781, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0303, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.6189061403274536, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0316, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4665672183036804, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.038, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.898800790309906, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0292, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.5205129384994507, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0322, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.588542640209198, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0307, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.620620846748352, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.035, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.639234185218811, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0296, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.38672956824302673, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0355, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.5244165062904358, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0305, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.8960945010185242, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0323, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3789278566837311, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.031, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.5104514956474304, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0405, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.5860878825187683, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0376, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.9913963079452515, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0386, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.4112319350242615, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0276, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.703815221786499, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0303, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.7342479825019836, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0303, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.46025165915489197, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0324, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.3976695239543915, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0255, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.4137699604034424, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0298, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.6333696842193604, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0438, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.5179958343505859, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0268, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.5947912335395813, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0266, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.7916423678398132, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0363, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.7686305046081543, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0338, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.5727254152297974, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0275, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.8913756012916565, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0365, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.45855259895324707, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0401, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.8214496374130249, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0371, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.5001949667930603, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.033, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.6546716094017029, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0422, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.35789239406585693, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0323, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.7539666891098022, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0316, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.422543466091156, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0388, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.5595449805259705, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0351, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.3847978115081787, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0285, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.4276559352874756, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0292, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.5125867128372192, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0351, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.7208243012428284, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0293, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.5181360244750977, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0316, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.3499206304550171, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0281, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.26258599758148193, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.027, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.7002774477005005, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.031, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.5419202446937561, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0384, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.3112017512321472, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0234, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.6459445357322693, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0302, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.5128807425498962, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0385, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.41403454542160034, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0321, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.4647153615951538, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0358, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.29951611161231995, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0288, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.3440749943256378, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0274, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.413753867149353, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0276, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.29087361693382263, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.03, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.7001593708992004, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0277, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.47245970368385315, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0426, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.5747501850128174, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0337, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.42420580983161926, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0407, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.2931080162525177, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0344, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.8410253524780273, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0385, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.27601751685142517, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0304, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5673372745513916, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0261, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.5385505557060242, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0296, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.4159039556980133, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0343, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 1.0409079790115356, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0325, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.5017931461334229, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0311, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.45170727372169495, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0302, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.7260886430740356, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0353, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.7251535058021545, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0329, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.21863135695457458, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0354, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.5168152451515198, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0268, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.509765088558197, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0321, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.4227997958660126, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.031, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.5740527510643005, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0351, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.5497387647628784, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0277, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.3965212106704712, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.028, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.43198928236961365, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0421, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.42254316806793213, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0335, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.3395012617111206, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0309, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.6258816719055176, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0287, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.7914189100265503, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0263, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.4104739725589752, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0282, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.47704172134399414, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0358, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.7908433675765991, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0341, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.7039026021957397, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0369, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.4095489978790283, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.047, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.6500707864761353, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0285, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3794250190258026, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0293, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.3065261244773865, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.031, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.3773103654384613, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0303, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.602186918258667, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0398, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.5309048891067505, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0251, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.9474682211875916, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0345, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.7786683440208435, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0289, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.6320096850395203, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0326, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.7034086585044861, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0332, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.5060988664627075, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0337, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.7484520673751831, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0317, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.6556681394577026, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0349, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.41952699422836304, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0318, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.4678110182285309, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0328, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.35579657554626465, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0346, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.5984554290771484, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0277, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.41169118881225586, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0288, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.5163332223892212, + "learning_rate": 1.285944160290905e-05, + "loss": 0.027, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.780305802822113, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0249, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.4293205142021179, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0302, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.650065004825592, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0349, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.3155161142349243, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0333, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.5841111540794373, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0371, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.3873291015625, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0304, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.39657002687454224, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0279, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.6305680871009827, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0293, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.5810249447822571, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0317, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.6288999319076538, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0283, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.5402754545211792, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0258, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 1.3184820413589478, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0398, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.9564218521118164, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0301, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.8810652494430542, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0376, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.4254887104034424, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0336, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.45076319575309753, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0266, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.6057546138763428, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0292, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.4007343649864197, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0352, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.4183088541030884, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0265, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.368300199508667, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0326, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.4838104844093323, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0262, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.5136057138442993, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0299, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.5161435604095459, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0339, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.6350359320640564, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0361, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.5247905254364014, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0259, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.5668240785598755, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0324, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.48688119649887085, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0395, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.8496071100234985, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0326, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.7072296142578125, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0307, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.7262448072433472, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0376, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.5265096426010132, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0331, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.7246168851852417, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0286, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.4539868235588074, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.036, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.36881664395332336, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0302, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.37113773822784424, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0278, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.537762463092804, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0325, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.6519997715950012, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0309, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.31448549032211304, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0245, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.43815988302230835, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0398, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.525791585445404, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0261, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.4887944757938385, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.025, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.5287007689476013, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0278, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.7277513146400452, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0304, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.6415050029754639, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0292, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.48691895604133606, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0337, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.53068608045578, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0338, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.5464624762535095, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0303, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.3911614418029785, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0345, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.6894099116325378, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0365, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.5268317461013794, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0405, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.8635499477386475, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0321, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21542859077453613, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0264, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.6257337331771851, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0355, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.6525475978851318, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0304, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.4599299430847168, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0314, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.7497361898422241, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.031, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.3124896287918091, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0257, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.6170748472213745, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0323, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.4619428515434265, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0315, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.5088011026382446, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0255, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.5397948622703552, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0265, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.457082062959671, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0279, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.4131294786930084, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0269, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 1.1949660778045654, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.6057063341140747, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0306, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.26918280124664307, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0283, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.48841091990470886, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0323, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.6195886135101318, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0295, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.5798623561859131, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.031, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.4877539277076721, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0267, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.33261221647262573, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0261, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.8361077904701233, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0311, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.305922269821167, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0302, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.22662357985973358, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.028, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.4273515045642853, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0307, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.521216869354248, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0277, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.7090896368026733, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0346, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.3693661391735077, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0305, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.3651321530342102, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0263, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.5577923655509949, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0357, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.6504148840904236, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0404, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.49205282330513, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.035, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.6053458452224731, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0328, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.5949649214744568, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0302, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.5310356020927429, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0264, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4087911546230316, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0273, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.35929426550865173, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0274, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.5112904906272888, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0253, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.39148232340812683, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0305, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.47718697786331177, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0304, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.620936393737793, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0289, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.8953443169593811, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0328, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.4663226902484894, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0302, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.707167387008667, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0319, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.5325813889503479, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0318, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.6239158511161804, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0289, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.38823947310447693, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0266, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.48849165439605713, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0234, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.23214028775691986, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0276, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3467197120189667, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0282, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.2009357064962387, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0298, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.8589951395988464, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0264, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.43969056010246277, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0292, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.5750611424446106, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0289, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.5399556756019592, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0307, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.20517395436763763, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0249, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.7490189671516418, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0246, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.6661257743835449, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0325, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.571394681930542, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0342, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.8792482018470764, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0332, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.5770248770713806, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0286, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.62962406873703, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0246, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.4651380479335785, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.037, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.5087499022483826, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0265, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.44421979784965515, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0306, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.6521517038345337, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0334, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.5384942889213562, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0296, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.41909387707710266, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0297, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.6697047352790833, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0331, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.4015032947063446, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0326, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.48070228099823, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0278, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.8651071786880493, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0242, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 1.17703378200531, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0288, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.45865103602409363, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0322, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.41243845224380493, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0297, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.482997864484787, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0305, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.5319142937660217, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0284, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.6116752028465271, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0311, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.4214901328086853, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0269, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.6246733069419861, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.026, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.4263368248939514, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0305, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.4059041738510132, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.022, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.6362516283988953, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0265, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.2905973494052887, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0297, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.42270833253860474, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0255, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.26410749554634094, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0252, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.7570974230766296, + "learning_rate": 1.153689339251154e-05, + "loss": 0.027, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.5941224098205566, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0295, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.3985750079154968, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0337, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.3877560496330261, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.024, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.44742006063461304, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0284, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.3280893564224243, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0318, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.5289477109909058, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0341, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.4976208806037903, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0239, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.6153465509414673, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0252, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.6112402677536011, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0292, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.4973732531070709, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0307, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.5871816277503967, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0254, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 1.2150986194610596, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.033, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.6406526565551758, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0265, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.4251798093318939, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0269, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.4702431857585907, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0311, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.3235304355621338, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0236, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.4913889467716217, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0231, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.4980977177619934, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0289, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.740922212600708, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0334, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.3305300772190094, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0301, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.7037357091903687, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0311, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.44783756136894226, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0339, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.7776843309402466, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0349, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.49181437492370605, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0285, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.333814799785614, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0284, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 1.203652262687683, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0365, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.521643877029419, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0313, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.33309581875801086, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0265, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.48567256331443787, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0357, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8473871946334839, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0355, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.43827518820762634, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0266, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.5849157571792603, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0317, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.5690399408340454, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0266, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.6484784483909607, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0294, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.8894811272621155, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0239, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4575272798538208, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0323, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.4288756847381592, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.032, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.8871303200721741, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0243, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.5861580967903137, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0335, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.4159319996833801, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0247, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.6948496699333191, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0299, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.5089551210403442, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0333, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.6912631392478943, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0303, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.6213784217834473, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0295, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.4634060561656952, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0261, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.5664045214653015, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0262, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.7963227033615112, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0278, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.45378491282463074, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0268, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.8970746994018555, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0271, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.5109472274780273, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0307, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.5023297667503357, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0263, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.6055631041526794, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0285, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.38602766394615173, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0282, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.5447302460670471, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0319, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.6613780856132507, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0271, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 1.0358555316925049, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.026, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.4463629722595215, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0271, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.5373798608779907, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.025, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.7735916972160339, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0325, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.5017692446708679, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0262, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.3406142592430115, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0271, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.28971537947654724, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0238, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.45441415905952454, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0261, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.4653581976890564, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.026, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.5449947714805603, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0314, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.41015395522117615, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0272, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.5936392545700073, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0269, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.5043690800666809, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0256, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.6176534295082092, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0285, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.6774734258651733, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0268, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.7045454978942871, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0305, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.5905448794364929, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0284, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.7881343364715576, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0321, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.6635507941246033, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0284, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.46298888325691223, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0394, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.5187172889709473, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0257, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.5974661707878113, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0305, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.5171123743057251, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0275, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.35988888144493103, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0295, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.30543047189712524, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0334, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.6582810878753662, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0309, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.4986134171485901, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0294, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.5560855269432068, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0224, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.28974607586860657, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0313, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.24015791714191437, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.026, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.2704199552536011, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0244, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.6661707162857056, + "learning_rate": 1.068904422762975e-05, + "loss": 0.027, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.5058556795120239, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0254, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.7086800336837769, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0242, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.6752822399139404, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0262, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.8279762268066406, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0312, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.5070614814758301, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0308, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.3933897614479065, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0287, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.37238794565200806, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0325, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.7591347098350525, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0265, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.4841652810573578, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0331, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.45236295461654663, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0412, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.4774094820022583, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0289, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.47564345598220825, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0294, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.341337651014328, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0281, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.341701865196228, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0224, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.6621959209442139, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0283, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.348466694355011, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0234, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.35208311676979065, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0248, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.4973156154155731, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0246, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.3668982982635498, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0228, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.4771873950958252, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0303, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.3595021665096283, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0265, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.6013099551200867, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0297, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.40996676683425903, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0321, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.45742037892341614, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0288, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.8092222213745117, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0278, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.32741186022758484, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0288, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.5716732740402222, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0256, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.3263239562511444, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0271, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.35390567779541016, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0266, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.36520150303840637, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0265, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.46227532625198364, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0305, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.40079647302627563, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0327, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.3689155578613281, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0249, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.49527907371520996, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.029, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.38931334018707275, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0233, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.5698918700218201, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0269, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 1.0959579944610596, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.029, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.6321646571159363, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0276, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.7166606783866882, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0292, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.6464444994926453, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0246, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.7318128347396851, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0296, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.4828032851219177, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0247, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.4509548842906952, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0241, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.413630872964859, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0313, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.42443349957466125, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0316, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.8199112415313721, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0389, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.28918105363845825, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0242, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.6759344339370728, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0308, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.5480250120162964, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.025, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.48897549510002136, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.027, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.6111220121383667, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0276, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.8852546215057373, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0251, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.5098162889480591, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.022, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.45974940061569214, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0206, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3925095200538635, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0251, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.5461363792419434, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0217, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.5685333609580994, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0231, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.494150310754776, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0243, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.8770614862442017, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0286, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.27142134308815, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0253, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.3365682363510132, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0241, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.5512370467185974, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0242, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.5581703782081604, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0276, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.306773841381073, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0262, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.44620928168296814, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0229, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.5870804786682129, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0228, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.26162099838256836, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0278, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.27250319719314575, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0293, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.8330137729644775, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0315, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.5206989645957947, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0282, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.5408382415771484, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0359, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.30517199635505676, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0267, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.5315027236938477, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0206, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.46061626076698303, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0222, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.47393080592155457, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0262, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.3686772882938385, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0254, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.3312757611274719, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0243, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.565447986125946, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0267, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.5690101385116577, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0237, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.44088438153266907, + "learning_rate": 9.911670744652783e-06, + "loss": 0.028, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.3708919882774353, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0265, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.589698851108551, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0297, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.6541375517845154, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0288, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.5304558873176575, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0243, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.5774737000465393, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0277, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.5616280436515808, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0267, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.6129759550094604, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0223, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.45278221368789673, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0304, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.44487202167510986, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0296, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.5391712188720703, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0256, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.43523359298706055, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0277, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.5308435559272766, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0242, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.3361283540725708, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0236, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.3764631450176239, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0304, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.9003425240516663, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0278, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.2787775993347168, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0219, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.40089285373687744, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0284, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.3619711101055145, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0252, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.7354542016983032, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0242, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.3854006826877594, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0302, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.3318389058113098, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0265, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.5286651849746704, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0235, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.24921932816505432, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0259, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.7376067042350769, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0238, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.35099226236343384, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0257, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.3805389702320099, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0198, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.4433703124523163, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0241, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.3667793571949005, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0268, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.2963331639766693, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0223, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.9817414879798889, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0248, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.6529688835144043, + "learning_rate": 9.612315882780393e-06, + "loss": 0.032, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.7663154602050781, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0267, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.6086964011192322, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0281, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.5240464806556702, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0339, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.6558368802070618, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0284, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.6192268133163452, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0309, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.5293763875961304, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0257, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.38831329345703125, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0239, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.12827467918396, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0323, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.411818265914917, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0274, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.5521355867385864, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0233, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.26673075556755066, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0317, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.5205486416816711, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0273, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.8010990619659424, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0292, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.420612633228302, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0274, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.4811270236968994, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0277, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.4959382712841034, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0288, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.4607725739479065, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0245, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.9101414680480957, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0283, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.38626620173454285, + "learning_rate": 9.42959233811777e-06, + "loss": 0.026, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.5709372758865356, + "learning_rate": 9.419993062475743e-06, + "loss": 0.021, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.4417913854122162, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0291, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.5651213526725769, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0228, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.4716165363788605, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0242, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.9120892286300659, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0296, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.5004292130470276, + "learning_rate": 9.372024722887089e-06, + "loss": 0.033, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.3422714173793793, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0284, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.5391610264778137, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0362, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.5446203351020813, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0247, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.5441875457763672, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0284, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.48274070024490356, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0245, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.6035326719284058, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0226, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.3104001581668854, + "learning_rate": 9.304949604077693e-06, + "loss": 0.029, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.27859869599342346, + "learning_rate": 9.295375311262483e-06, + "loss": 0.022, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.3896406292915344, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0235, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.4526473581790924, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0289, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.6624506115913391, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0265, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.6976125836372375, + "learning_rate": 9.257098257046206e-06, + "loss": 0.029, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.5974310040473938, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0205, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.7627739906311035, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0333, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.3166525065898895, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0309, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.41519322991371155, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0223, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.31840237975120544, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0239, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.47412827610969543, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0228, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.41170552372932434, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0209, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.45858854055404663, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0243, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.7870534658432007, + "learning_rate": 9.171095634265995e-06, + "loss": 0.027, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.4080354869365692, + "learning_rate": 9.161550369445782e-06, + "loss": 0.023, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.47916823625564575, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0303, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.6911760568618774, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0263, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.3980148732662201, + "learning_rate": 9.132927564918328e-06, + "loss": 0.028, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.47085851430892944, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0266, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.5085862874984741, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0239, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.5219245553016663, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0267, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.5199264287948608, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0277, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.6157195568084717, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0343, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.5366696715354919, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0271, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.3640076220035553, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0258, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.5320505499839783, + "learning_rate": 9.05669731553499e-06, + "loss": 0.024, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.507826566696167, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0253, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.741392195224762, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0242, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.5325136184692383, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0224, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.4709665775299072, + "learning_rate": 9.018636566864313e-06, + "loss": 0.026, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.4371986985206604, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0264, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.47594818472862244, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0224, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.488423228263855, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0261, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.24745763838291168, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0206, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.5042629837989807, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0305, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.5255836844444275, + "learning_rate": 8.961615424107555e-06, + "loss": 0.026, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.4605107307434082, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0274, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.3252561390399933, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0227, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.35779184103012085, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0296, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.2960403263568878, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0212, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.6344659328460693, + "learning_rate": 8.914163487132906e-06, + "loss": 0.026, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.4614463150501251, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0234, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4490053951740265, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0265, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.5291271209716797, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0326, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.5311887264251709, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0257, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.5647584199905396, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0295, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.3913862705230713, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0256, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.4476219415664673, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0248, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.7807655930519104, + "learning_rate": 8.83836825410936e-06, + "loss": 0.026, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.38984328508377075, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0247, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.5757346153259277, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0296, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.25636178255081177, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0222, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.45617344975471497, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0224, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.3066493272781372, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0237, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.26513972878456116, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0277, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.445230633020401, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0248, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.4914413392543793, + "learning_rate": 8.762735374981932e-06, + "loss": 0.022, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.41469570994377136, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0245, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.33235347270965576, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0229, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.4890037775039673, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0247, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.41330578923225403, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0285, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.6309427618980408, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0233, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.42090296745300293, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0254, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.5888519287109375, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0262, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.5488774180412292, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0262, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.48015111684799194, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0219, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.4484168291091919, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0276, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.4128018319606781, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0218, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.5151517987251282, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0242, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.6248350143432617, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0267, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.4116908013820648, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0242, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.6138579249382019, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0282, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.22843605279922485, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0284, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.49555841088294983, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0244, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.5752411484718323, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0275, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.5129706859588623, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0237, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.751230001449585, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0257, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.47749435901641846, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0277, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21702095866203308, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0255, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.30658838152885437, + "learning_rate": 8.54624657467318e-06, + "loss": 0.024, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.3589625954627991, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0215, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.5434426069259644, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0224, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.8732438683509827, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0289, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.34988290071487427, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0226, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.4021032154560089, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0248, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.4676196873188019, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0235, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.41646474599838257, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0235, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.5892519950866699, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0221, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.5757095217704773, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0258, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.4664652645587921, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0275, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.4674879014492035, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0285, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.7277936339378357, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0316, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.40373867750167847, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0213, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.8632686138153076, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0239, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.5620945692062378, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0259, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3430384695529938, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0287, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.46981969475746155, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0218, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.3494231700897217, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0238, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.514975368976593, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0205, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.6442168951034546, + "learning_rate": 8.359228888944986e-06, + "loss": 0.021, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.32178881764411926, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0219, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.48865941166877747, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0261, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.6131434440612793, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0269, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.4471806585788727, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0251, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.8255780935287476, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0229, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.843673586845398, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0278, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.4278610348701477, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0228, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.5036011338233948, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0291, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.5141382813453674, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0217, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.8976346850395203, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0248, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.5634751319885254, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0276, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.5327013731002808, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0279, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.2723959982395172, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0225, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.4455258846282959, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0222, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.3784103989601135, + "learning_rate": 8.219774325200873e-06, + "loss": 0.024, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.8102694749832153, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0231, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.5179240703582764, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0255, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.39830490946769714, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0264, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.32860279083251953, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0241, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.5459582209587097, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0193, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.3841477036476135, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0282, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.7849119305610657, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0319, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.4457703232765198, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0279, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.30464428663253784, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0184, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 1.0635287761688232, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0265, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.33294421434402466, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0235, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.5644985437393188, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0218, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.4975566565990448, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0261, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.7503839135169983, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0218, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.35363277792930603, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0198, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.43968406319618225, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0253, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.4553394615650177, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0266, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.45489153265953064, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0264, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.424696147441864, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0209, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.4819740653038025, + "learning_rate": 8.03498318084394e-06, + "loss": 0.022, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.564834475517273, + "learning_rate": 8.025779439806006e-06, + "loss": 0.024, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.7905157804489136, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0261, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.6985124349594116, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0315, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.42378291487693787, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0237, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.5980759263038635, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0217, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.45916232466697693, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0235, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.25486481189727783, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0231, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.4072360694408417, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0261, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3813820481300354, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0209, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.3040210008621216, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0225, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.30910906195640564, + "learning_rate": 7.933935782312965e-06, + "loss": 0.026, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.6573566794395447, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0262, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.30632153153419495, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0251, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.3277539610862732, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0233, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.49589917063713074, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0211, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.4149130880832672, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0203, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.7051926851272583, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0272, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.8553881049156189, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0236, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.5676615238189697, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0242, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.29548707604408264, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0236, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.36076608300209045, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0219, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.3657922148704529, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0227, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.27593615651130676, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0251, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.35554730892181396, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0259, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.45652297139167786, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0274, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.5757999420166016, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0222, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.5138059854507446, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0216, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.338874876499176, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0232, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.48215195536613464, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0226, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.30239933729171753, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0205, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.6099343299865723, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0219, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.6730902791023254, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0239, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.4575020968914032, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0204, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.2673267424106598, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0222, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.3593531548976898, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0225, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.5385488867759705, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0248, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.3900541663169861, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0277, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.6182276010513306, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0241, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.4897976815700531, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0229, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.5717247128486633, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0273, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.4837515950202942, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0219, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.31954509019851685, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0271, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.23005163669586182, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0204, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.500217616558075, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0229, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.47326523065567017, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0203, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.5074726939201355, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0249, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.6583673357963562, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0243, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.7585731744766235, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0264, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.3782348036766052, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0216, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.43963512778282166, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0201, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.6450467109680176, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0254, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.3420482575893402, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0224, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.3532888889312744, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0216, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.32494598627090454, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0196, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.2898419499397278, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0234, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.4379838705062866, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0233, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.5390518307685852, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0169, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.3786150813102722, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0203, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.3376149833202362, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0266, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.40810349583625793, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0241, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.24485738575458527, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0199, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.4670563340187073, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0184, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.374255508184433, + "learning_rate": 7.4623904967312e-06, + "loss": 0.018, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.4191536605358124, + "learning_rate": 7.453427567620127e-06, + "loss": 0.022, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3807078003883362, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0232, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.7537381649017334, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0202, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.36507129669189453, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0236, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.24461498856544495, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0221, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.351654589176178, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0236, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.35627686977386475, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0213, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.4586603343486786, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0304, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.4082098603248596, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0237, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.47707459330558777, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0247, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.4687316119670868, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0344, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.4660017788410187, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0214, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.4644101560115814, + "learning_rate": 7.346200065486093e-06, + "loss": 0.022, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.3139079213142395, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0234, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.36445188522338867, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0262, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.6457782983779907, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0261, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.4184044599533081, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0245, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.44356703758239746, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0215, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.5394402742385864, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0302, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 0.5960429906845093, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0234, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2850514352321625, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0243, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.45071718096733093, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0233, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.3157344162464142, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0254, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.45518410205841064, + "learning_rate": 7.248450164740439e-06, + "loss": 0.024, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.2323702722787857, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0226, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.6025039553642273, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0246, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.4983830749988556, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0199, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.3684524595737457, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0252, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.36924007534980774, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0277, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.3531496822834015, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0228, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.3995579779148102, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0193, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.4124946892261505, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0221, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.3897329866886139, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0221, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.45230787992477417, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0238, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.45878538489341736, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0244, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.4302407503128052, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0237, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.30422642827033997, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0173, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.49566513299942017, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0201, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.43262094259262085, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0227, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.8250450491905212, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0259, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.3265332281589508, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0205, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.2871774435043335, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0201, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.4341558814048767, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0199, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.43365293741226196, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0201, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.5876246690750122, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0205, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.2719171643257141, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0211, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.38791123032569885, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0244, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.4082484543323517, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0206, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.5010205507278442, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0245, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.4404369294643402, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0268, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.5171347856521606, + "learning_rate": 7.010805483338283e-06, + "loss": 0.024, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.5137951970100403, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0241, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.563709557056427, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0193, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.44687238335609436, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0207, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.33815798163414, + "learning_rate": 6.975884226362e-06, + "loss": 0.0246, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.33789384365081787, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0206, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.38053908944129944, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0195, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.5730066299438477, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0199, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.42453598976135254, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0218, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.48010921478271484, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0328, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.5227254629135132, + "learning_rate": 6.923644220932124e-06, + "loss": 0.019, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4078599810600281, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0212, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.4473094046115875, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0281, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.3459968864917755, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0231, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.4205886721611023, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0256, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.5397320985794067, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0214, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.6208626627922058, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0224, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.34377506375312805, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0197, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.4086950123310089, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0202, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.5211176872253418, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0201, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3705415725708008, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0219, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.32692769169807434, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0204, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.42599135637283325, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0213, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.565449595451355, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0223, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.4027825593948364, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0233, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.4833034574985504, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0309, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.5570312738418579, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0213, + "step": 20000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2492543845046682e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5fecc60b61aa66699566b01045633ce2fd4a6a74 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-20000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad96fcc5212b0fb64af2ed9b5a1ad33dee0cea6a86c08271b39c38f4388a38a +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..376e3ce134dc60afef2246789621ae93a8bec554 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b12ac0f772ad3855767af4b5548b37ea3ac933266caadd2ded3bb8c232ea354 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4b79a9a2c1a9803ade2a8e9e11e742127c99f289 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0422d951ba0ff1a1b5dbbacac3e6a4a2819966cefc282b185b1b0d2d98eed16c +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..89f0e020a14575224563a78ee1586c5a62fb3f4f --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cab0fea1d6ed6d321fa693779cbfa44724b550ab5861d3f4ab0671f337059ef0 +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9801b876d4902a6f04c8f4fc65c072e6082867 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -4.131592681121827, + -18.96289906921387, + -16.909606227111816, + -1.205507601451874, + -2.2364452423095704, + -1.8819086204528812, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 16.65274486618042, + 37.19429024200439, + 23.655689654541014, + 1.3209557065963748, + 2.6528479496955875, + 1.1486967510223387, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 2.868856906890869, + 6.296340465545654, + 1.3196077346801758, + 0.007151931058615446, + -0.012491658329963684, + -0.12626242637634277, + 0.12140887975692749, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 4.3321146965026855, + 12.4215087890625, + 7.703039169311523, + 0.391439288854599, + 0.8076039552688599, + 0.505150318145752, + 0.9926025867462158, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.763728466033935, + -21.229162658691408, + -2.350775989151001, + -4.0587354017257695, + -3.285622364997864, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.5495108631134, + 30.41332916412354, + 14.36571702880859, + 1.8286980584144592, + 2.2455153399467473, + 1.9114159921646117, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.469674587249756, + 1.137302041053772, + -3.50521183013916, + -0.009232619777321815, + -0.7088616490364075, + -0.43785586953163147, + 0.14176446199417114, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.948984146118164, + 16.641460418701172, + 8.162801742553711, + 0.6890953779220581, + 1.1180040836334229, + 0.9564125537872314, + 0.9899004101753235, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d63a3605b4005c36d1942bcd2dd764720a9bffed --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/trainer_state.json @@ -0,0 +1,15434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.318233567020193, + "eval_steps": 500, + "global_step": 22000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 5.55898904800415, + "learning_rate": 1.8e-07, + "loss": 0.7669, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.9875104427337646, + "learning_rate": 3.8e-07, + "loss": 0.7281, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 6.316451072692871, + "learning_rate": 5.800000000000001e-07, + "loss": 0.7134, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 4.037688255310059, + "learning_rate": 7.8e-07, + "loss": 0.6077, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 5.4920220375061035, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6779, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 3.809985876083374, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5578, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 5.501481533050537, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5453, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 2.584683418273926, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4145, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 2.854585886001587, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3617, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 3.2181553840637207, + "learning_rate": 1.98e-06, + "loss": 0.3402, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 1.6713179349899292, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2286, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 2.60302996635437, + "learning_rate": 2.38e-06, + "loss": 0.2477, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 1.7488818168640137, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1342, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 1.826812982559204, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.1243, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 1.1744091510772705, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1012, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 2.3573529720306396, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1108, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 2.1422371864318848, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1081, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.6756604313850403, + "learning_rate": 3.58e-06, + "loss": 0.0947, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 1.8197052478790283, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.103, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 2.135390281677246, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0791, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 1.185013771057129, + "learning_rate": 4.18e-06, + "loss": 0.0751, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 1.478454828262329, + "learning_rate": 4.38e-06, + "loss": 0.0685, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 1.1979939937591553, + "learning_rate": 4.58e-06, + "loss": 0.0642, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 1.3315266370773315, + "learning_rate": 4.78e-06, + "loss": 0.0706, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 1.219875454902649, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 1.9281997680664062, + "learning_rate": 5.18e-06, + "loss": 0.0781, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.5599610209465027, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0742, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.9128719568252563, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0638, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.5633432269096375, + "learning_rate": 5.78e-06, + "loss": 0.0633, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.7961149215698242, + "learning_rate": 5.98e-06, + "loss": 0.062, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 1.9408375024795532, + "learning_rate": 6.18e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 1.1925369501113892, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0654, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 1.0636825561523438, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0513, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.5671424865722656, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0561, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.8431388139724731, + "learning_rate": 6.98e-06, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 1.3813819885253906, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0619, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.7528055906295776, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0502, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 1.38446044921875, + "learning_rate": 7.58e-06, + "loss": 0.0623, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.9472984671592712, + "learning_rate": 7.78e-06, + "loss": 0.0471, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.640555739402771, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0539, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 1.4841065406799316, + "learning_rate": 8.18e-06, + "loss": 0.0684, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 1.0691452026367188, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.8026740550994873, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0579, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 1.3472259044647217, + "learning_rate": 8.78e-06, + "loss": 0.0725, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.8364902138710022, + "learning_rate": 8.98e-06, + "loss": 0.0613, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 1.0086181163787842, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0558, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 1.0559569597244263, + "learning_rate": 9.38e-06, + "loss": 0.0561, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.9138600826263428, + "learning_rate": 9.58e-06, + "loss": 0.0507, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.6099390387535095, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0543, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.890690803527832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.071, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.8349231481552124, + "learning_rate": 1.018e-05, + "loss": 0.0515, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 1.5466762781143188, + "learning_rate": 1.038e-05, + "loss": 0.0865, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 1.0859519243240356, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0511, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.7235454320907593, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.6314525008201599, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0494, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 1.5067164897918701, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0453, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.9329689145088196, + "learning_rate": 1.138e-05, + "loss": 0.0565, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 1.3631505966186523, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0513, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 1.2341063022613525, + "learning_rate": 1.178e-05, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.7126315832138062, + "learning_rate": 1.198e-05, + "loss": 0.0465, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.9995419383049011, + "learning_rate": 1.218e-05, + "loss": 0.0423, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.7614652514457703, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0466, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.7718682289123535, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0508, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.7280911803245544, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0481, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.6350377798080444, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0493, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.6868598461151123, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.057, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 1.132020354270935, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0464, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 1.097875952720642, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0465, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.8246905207633972, + "learning_rate": 1.378e-05, + "loss": 0.0488, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.5858931541442871, + "learning_rate": 1.398e-05, + "loss": 0.0533, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.418e-05, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.87618488073349, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0417, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.8312808871269226, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0627, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.5213949680328369, + "learning_rate": 1.478e-05, + "loss": 0.0526, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.7599508762359619, + "learning_rate": 1.498e-05, + "loss": 0.0487, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.9282987713813782, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0544, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 1.5959566831588745, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0594, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.6384497284889221, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.049, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.5377854108810425, + "learning_rate": 1.578e-05, + "loss": 0.0529, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.6186609864234924, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0485, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.9750168323516846, + "learning_rate": 1.618e-05, + "loss": 0.0458, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.6810588836669922, + "learning_rate": 1.638e-05, + "loss": 0.0521, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.8613447546958923, + "learning_rate": 1.658e-05, + "loss": 0.0464, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.8379164338111877, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0589, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.9312345385551453, + "learning_rate": 1.698e-05, + "loss": 0.0534, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.6983106732368469, + "learning_rate": 1.718e-05, + "loss": 0.0591, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.6549938321113586, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0571, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3887499272823334, + "learning_rate": 1.758e-05, + "loss": 0.0362, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 1.1392686367034912, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0602, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.834979772567749, + "learning_rate": 1.798e-05, + "loss": 0.0483, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.9094700813293457, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0536, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.9519254565238953, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0514, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.6514044404029846, + "learning_rate": 1.858e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.6005147099494934, + "learning_rate": 1.878e-05, + "loss": 0.0527, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 1.0990339517593384, + "learning_rate": 1.898e-05, + "loss": 0.0453, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.7029110193252563, + "learning_rate": 1.918e-05, + "loss": 0.0527, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.6106461882591248, + "learning_rate": 1.938e-05, + "loss": 0.043, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.48976996541023254, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0482, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 1.045139193534851, + "learning_rate": 1.978e-05, + "loss": 0.0449, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.7444337010383606, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0499, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.8378720879554749, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0606, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.5345956683158875, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.041, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.6428268551826477, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0648, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.9010246992111206, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0441, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.6655222177505493, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0532, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.5328973531723022, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0488, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 1.2394806146621704, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0525, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.9671902656555176, + "learning_rate": 1.999967041472886e-05, + "loss": 0.051, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.8754792213439941, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.054, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.524354875087738, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0682, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 1.0633796453475952, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0435, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.7348024249076843, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.923546552658081, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0501, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 1.0579051971435547, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.8214036822319031, + "learning_rate": 1.999882759038658e-05, + "loss": 0.057, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.7640904188156128, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0468, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5744732022285461, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0416, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.40397152304649353, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0389, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.6207796931266785, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0484, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 1.5230320692062378, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0586, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.8499330282211304, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0671, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.7697583436965942, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.061, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.6107252836227417, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.40468829870224, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0558, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.7711566686630249, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0487, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 1.0216137170791626, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0411, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 1.1135109663009644, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0428, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.545289158821106, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0426, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.9514102339744568, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0529, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.9448748826980591, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0468, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 1.1176340579986572, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.06, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.6428054571151733, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0398, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.8000763058662415, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0688, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.7624617218971252, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0524, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.7986068725585938, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0511, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 1.179044246673584, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0518, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.7511209845542908, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.041, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.8336644768714905, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.7198546528816223, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0472, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 1.404756784439087, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0479, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.861412525177002, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0448, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 1.2575286626815796, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0504, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.7020149230957031, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0416, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.9072129726409912, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0483, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.5503928661346436, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0498, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.5776561498641968, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.7854406237602234, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0431, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.7011817097663879, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0615, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.7760916352272034, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0525, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.9866206049919128, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0492, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.7466640472412109, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0564, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.8808642029762268, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0461, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.8980852365493774, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0613, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.6824257969856262, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0763, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.681532084941864, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0492, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.5667393207550049, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0471, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.5026432275772095, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0424, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.37448638677597046, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.037, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.6236661076545715, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.9748323559761047, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0326, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.7733910083770752, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0527, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.6466084718704224, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.6644402146339417, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0434, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 1.5936143398284912, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0495, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.5655786991119385, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0475, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.9557194709777832, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0518, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.8929481506347656, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0435, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.7515624761581421, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0404, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.7718303203582764, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0476, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.5583183765411377, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0495, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.7166038155555725, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0601, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.9311782717704773, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0507, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6159361600875854, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0319, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.816769003868103, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0505, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.9040331244468689, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.0498, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 1.696012020111084, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0689, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.5169436931610107, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0414, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 1.9156256914138794, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0558, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.6522107720375061, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0427, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.8480607867240906, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0425, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.6939795017242432, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0521, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.5763843059539795, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0486, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.6420201063156128, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0428, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.5305889248847961, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0371, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 1.3216971158981323, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0441, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.6441370844841003, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0444, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 1.4227683544158936, + "learning_rate": 1.996014938229576e-05, + "loss": 0.053, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.667000412940979, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0405, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.6865925192832947, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0532, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.8819414377212524, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0402, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.8738685250282288, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0494, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.8790421485900879, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0753, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.5451251268386841, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0385, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.46721863746643066, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0395, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.41896265745162964, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0461, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.7582527995109558, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0461, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.7154091596603394, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0464, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.788686215877533, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.46885132789611816, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0472, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.5174703598022461, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0501, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.8058022260665894, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.044, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.49327152967453003, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0404, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 1.532515048980713, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0548, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 1.1101130247116089, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0542, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.7396823763847351, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.042, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5801792740821838, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0589, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 1.4451886415481567, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0402, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.61793053150177, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0583, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.8073042631149292, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0492, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.9468027949333191, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0466, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.7384629249572754, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0589, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.4612124562263489, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.043, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.6821345090866089, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0373, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.6727206110954285, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0706, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.6935863494873047, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0376, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.9824007153511047, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0418, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.9782054424285889, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0453, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.7749345898628235, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0449, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 1.1558616161346436, + "learning_rate": 1.992544454099507e-05, + "loss": 0.051, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.33876606822013855, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0463, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.5539175271987915, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0389, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.554639995098114, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0375, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.46284249424934387, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0365, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.7209586501121521, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0465, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 1.0352572202682495, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0609, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.3893347680568695, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0449, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.3959295451641083, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.042, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.47758615016937256, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0608, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.7173318266868591, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0511, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.5889247059822083, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.5986958146095276, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.9506963491439819, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0513, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.8730902671813965, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0429, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.5152983069419861, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0347, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.786233127117157, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0464, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.7376151084899902, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0479, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.595055878162384, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0392, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.8207923769950867, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0441, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.7003177404403687, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.036, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.6637803316116333, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.5207458138465881, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0476, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 1.241939663887024, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0466, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.7212964296340942, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0459, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.6244897246360779, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.571205198764801, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0611, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.8839776515960693, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0464, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.580142080783844, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0434, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.6745111346244812, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0443, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.9726730585098267, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.48007458448410034, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0442, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.7205815315246582, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0461, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.5800597667694092, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0553, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.6497617959976196, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0398, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.7487000226974487, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.053, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.6686383485794067, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0494, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.6101617217063904, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0397, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.49039891362190247, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 1.076252818107605, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0472, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.7085466980934143, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0481, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.6343501210212708, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.7452435493469238, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0485, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.6645557880401611, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0455, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.5987662076950073, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0384, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 1.078682541847229, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0416, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.8880276083946228, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0427, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.8119439482688904, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0516, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.5018808245658875, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.035, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.623843252658844, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0468, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.48201584815979004, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0387, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.5672967433929443, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0374, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.7304291129112244, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0458, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 1.1493513584136963, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0495, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.8220258951187134, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0565, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 1.0740118026733398, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0484, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.6214267015457153, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0346, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6255515813827515, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 1.0625102519989014, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0511, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.8623147010803223, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.043, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.92961186170578, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0428, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.6050530076026917, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0405, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.944632351398468, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0434, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.4904105067253113, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0423, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.7352654337882996, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0425, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 1.0492011308670044, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0616, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.7823440432548523, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0447, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.8018720149993896, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0371, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.49853745102882385, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.036, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.8805229663848877, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0524, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.5573164820671082, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0387, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.7481330633163452, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0466, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.40816730260849, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0651, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.6791403889656067, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0393, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.7291558384895325, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0521, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.6312416791915894, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0489, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.7327824831008911, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0343, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 1.3112396001815796, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 1.2425460815429688, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0419, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.6839079856872559, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0491, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.7781338691711426, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0434, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.5329035520553589, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0468, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.7196246981620789, + "learning_rate": 1.978769450291435e-05, + "loss": 0.044, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.7625473737716675, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0441, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.5458085536956787, + "learning_rate": 1.978346349055984e-05, + "loss": 0.039, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.7765107154846191, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0467, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.7010345458984375, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.04, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.626748263835907, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0373, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.5149411559104919, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0461, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.9740221500396729, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.037, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.504397988319397, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.054, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.5483772158622742, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0365, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.29313552379608154, + "learning_rate": 1.976612732743278e-05, + "loss": 0.038, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.8453809022903442, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0413, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.5152369141578674, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0383, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.9969985485076904, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0465, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.9506912231445312, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0377, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.9154256582260132, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0428, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 1.2283018827438354, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0403, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.6880149841308594, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0395, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.4900283217430115, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0368, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.7604786157608032, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0447, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.559420108795166, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0456, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5867525339126587, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.4810929596424103, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0406, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.8294567465782166, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0405, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.8964418172836304, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0551, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5311513543128967, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.048, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.806564450263977, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0422, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.6752825975418091, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0436, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.5873673558235168, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.046, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.44951826333999634, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.6930672526359558, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0482, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5176821351051331, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0469, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.49050986766815186, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0505, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.7312544584274292, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0397, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.7582018375396729, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0472, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.5867499113082886, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0402, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.5435264706611633, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0357, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.7370457053184509, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.774713933467865, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0419, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 1.3614526987075806, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0443, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.6087996959686279, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0362, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.6685174703598022, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0437, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.9508783221244812, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0403, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.5553990006446838, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0454, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.5054144263267517, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0651, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.42293739318847656, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0431, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.7212286591529846, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0415, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.473127543926239, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.6872493028640747, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.031, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.5251455903053284, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0391, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.5380337834358215, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0409, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.7052116394042969, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0416, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.8229309916496277, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0372, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.9506240487098694, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0419, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.6417449116706848, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.6112877130508423, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0498, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 1.0621747970581055, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0478, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.7538444995880127, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0402, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.5625021457672119, + "learning_rate": 1.964833301001045e-05, + "loss": 0.048, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.47914358973503113, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0371, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.6854084134101868, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0478, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.9252145886421204, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0368, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.8439743518829346, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0417, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 1.0050065517425537, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0444, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.7451267242431641, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0444, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.8371824622154236, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0413, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 1.0461528301239014, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0343, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.39973369240760803, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0411, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.4291725754737854, + "learning_rate": 1.962083815106258e-05, + "loss": 0.035, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.7072318196296692, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.5897591710090637, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0422, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.724743664264679, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0412, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.6499989628791809, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0456, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.7375554442405701, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0481, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.5231707096099854, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0444, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.6235650777816772, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0352, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.43499720096588135, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0389, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.797736406326294, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0444, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 1.0550916194915771, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0504, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.6214169263839722, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0406, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.698083221912384, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0593, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.6379665732383728, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0493, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.5507146120071411, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0433, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.5956857204437256, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.44772031903266907, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0479, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.9360495209693909, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0434, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.5642439126968384, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0396, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.4046037495136261, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0408, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.5948778986930847, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0349, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.8199960589408875, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.035, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.4827987253665924, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0422, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.8324541449546814, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0396, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.4008340537548065, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0399, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.6216022372245789, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0456, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.37505266070365906, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0385, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.49176743626594543, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0394, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.5399725437164307, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0438, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.8310949802398682, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 1.1955338716506958, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0459, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 1.0068060159683228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0491, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.5460902452468872, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.7850955128669739, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.038, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.36727651953697205, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.042, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.5334084630012512, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0472, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.7271261215209961, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0382, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.5323888063430786, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0436, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.45585381984710693, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0374, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.7871994376182556, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0523, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.5605924129486084, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0394, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.6938880085945129, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0394, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.5804795026779175, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0437, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 1.0168874263763428, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0419, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.6860261559486389, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0381, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.7029629349708557, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.5081820487976074, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0359, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4721413254737854, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0445, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.36132606863975525, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0443, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.6331628561019897, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.5754039287567139, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0364, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 1.5680726766586304, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0568, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.49352893233299255, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0352, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.6292720437049866, + "learning_rate": 1.945830755977688e-05, + "loss": 0.056, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.7185224294662476, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.5580431222915649, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0395, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.7590157985687256, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0367, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.6500505208969116, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0373, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.408975213766098, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0458, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.5616204142570496, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.6361889243125916, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0371, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.8486977219581604, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0428, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.7492835521697998, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0444, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.7901867032051086, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0413, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.6845218539237976, + "learning_rate": 1.942106227801521e-05, + "loss": 0.041, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.9644033908843994, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0482, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.45466694235801697, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.37155815958976746, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0563, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.4936427175998688, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0466, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.6540364027023315, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0426, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.38369905948638916, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.5450782179832458, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0499, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.24151510000228882, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0431, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.8069043159484863, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0447, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.5423257946968079, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0496, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.4058588445186615, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0402, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.6126188635826111, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0458, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.7490487694740295, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0493, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.7295238971710205, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0462, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.7178632616996765, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0443, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.7040836215019226, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0414, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.6338651776313782, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0354, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 1.3360642194747925, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0503, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.46927154064178467, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.7340303659439087, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0381, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5492366552352905, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0328, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.7509336471557617, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0368, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.4471103847026825, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0405, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.6582043170928955, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0422, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.6933317184448242, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0347, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.450021892786026, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.5376274585723877, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0619, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.722744882106781, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0446, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.6075776219367981, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.047, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.6950559020042419, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0366, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.5763269066810608, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0416, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.5462995767593384, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.042, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.6304270029067993, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0388, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.6828057765960693, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0324, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.37152284383773804, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0451, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.4172256588935852, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0357, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.5640333294868469, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0522, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.8016167879104614, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0381, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.591262698173523, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0382, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.5212893486022949, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0478, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.7837402820587158, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0443, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.9257993698120117, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0468, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.5952717065811157, + "learning_rate": 1.926404507646751e-05, + "loss": 0.033, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.9675727486610413, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0451, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5518060326576233, + "learning_rate": 1.925630281527157e-05, + "loss": 0.039, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.9742224216461182, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0398, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.6197847723960876, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0466, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.47963154315948486, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0449, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.41337478160858154, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0441, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.7238340973854065, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0438, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.9248948097229004, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.059, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.6670559048652649, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0388, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.956350564956665, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0402, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.6378766894340515, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0377, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.9037134647369385, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.046, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.7720431685447693, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0519, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.7988153100013733, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0437, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.6672379970550537, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.8264118432998657, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0463, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.6753244400024414, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.048, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.5530163645744324, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0552, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 1.4215611219406128, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0537, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.8495141267776489, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0431, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.5609806180000305, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0355, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.30011680722236633, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0503, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.5155858993530273, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0402, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.48371294140815735, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0476, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.49065709114074707, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.4877799451351166, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0337, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.5917441248893738, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0379, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.42583322525024414, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.045, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.6343463659286499, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0449, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.8575727343559265, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0453, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.7644649147987366, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0396, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.6534778475761414, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0354, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.3632607161998749, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.035, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.9180629849433899, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.48914220929145813, + "learning_rate": 1.912298771234382e-05, + "loss": 0.043, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.8579902052879333, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0467, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 1.523177146911621, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 1.2650493383407593, + "learning_rate": 1.911035077753307e-05, + "loss": 0.046, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.8262631893157959, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0345, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.8710194826126099, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.8287770748138428, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.7243760824203491, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0445, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5953600406646729, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0409, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.5678296685218811, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0405, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.764759361743927, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0399, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.5969082713127136, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0345, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.5686851739883423, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0415, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.7042335867881775, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0343, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.46049684286117554, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0367, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.521037757396698, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0493, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.6116137504577637, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0341, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.6932541728019714, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.038, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.6795322299003601, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0555, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 1.5589205026626587, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0498, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.58689945936203, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0432, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.7746279239654541, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0455, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4707143008708954, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0365, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.6717873811721802, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0441, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 1.1001774072647095, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0387, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.6617273092269897, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.045, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 1.0732862949371338, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0461, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.43623387813568115, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0387, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.5842541456222534, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0401, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.8832051753997803, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0434, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.8454849123954773, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0364, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4587421119213104, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0342, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.5914700627326965, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0381, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.5075448751449585, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0614, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.6165316700935364, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0355, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.8761339783668518, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0382, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.8730667233467102, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0486, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.4631735384464264, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0479, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.7657212615013123, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0359, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.49685898423194885, + "learning_rate": 1.894749443411004e-05, + "loss": 0.037, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.8567603230476379, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0415, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.8778802156448364, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0427, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.7849876284599304, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.041, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.49304109811782837, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0406, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.6490961909294128, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0457, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 1.1704363822937012, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0489, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.6955687403678894, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0426, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.9385222792625427, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0397, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 1.0259507894515991, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0406, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 1.5581048727035522, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0377, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 1.1154224872589111, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0352, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.8913238048553467, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0372, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.32929253578186035, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0302, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.7686375379562378, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.7077587246894836, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0404, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.7370178699493408, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0379, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.8013477325439453, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0391, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.9743591547012329, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0466, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.6816489100456238, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0509, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.7814317345619202, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0449, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.6797910332679749, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.041, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.7159250378608704, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0408, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.7630175352096558, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0403, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.7929314374923706, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0468, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.5765302181243896, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0382, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.5043740272521973, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0447, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.7895818948745728, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0381, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.8037170767784119, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0434, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 1.0758732557296753, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0369, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.6673927307128906, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0475, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.6661775708198547, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0478, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.6422731280326843, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.6632615923881531, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0377, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.5715954899787903, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0306, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3375200629234314, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0385, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.42938506603240967, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0359, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.453436940908432, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0498, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.763883113861084, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.037, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.9350517392158508, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0524, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.6795313358306885, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0336, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4761887788772583, + "learning_rate": 1.875213208215953e-05, + "loss": 0.04, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.6547576189041138, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0359, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.7119831442832947, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0382, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.5195598602294922, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0577, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.44893282651901245, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.034, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.5159012079238892, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0374, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.6474353075027466, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0275, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.5070436000823975, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0382, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.28868627548217773, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0442, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.3915226459503174, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.6271824836730957, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0395, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 1.2117619514465332, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0409, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.4455721378326416, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0399, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0445, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.32646581530570984, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0435, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.4477322995662689, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0383, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.6562448740005493, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0317, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.25427868962287903, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0326, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.6234788298606873, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0328, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.4264411926269531, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0379, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.5537038445472717, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0383, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.5042442679405212, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0339, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.4152010679244995, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0324, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.6834092736244202, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0364, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.6276392340660095, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0336, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.687937319278717, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0415, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.48481765389442444, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0376, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 1.1335153579711914, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0421, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.6853719353675842, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.043, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.97500079870224, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0334, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.2953243553638458, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0334, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.6563237309455872, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0349, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.4983973205089569, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0441, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.42969775199890137, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0319, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.8316324353218079, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0359, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.4386466443538666, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0371, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.5664681792259216, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0359, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.5660601854324341, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.6432987451553345, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0447, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.6026568412780762, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0382, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.5358585119247437, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0366, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.3575671315193176, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0394, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.6645073890686035, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0391, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.6527594923973083, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0334, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.5664045810699463, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0426, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.3317505419254303, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0366, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.7218614220619202, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0399, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.6683867573738098, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0385, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.6589217185974121, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0445, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.39663317799568176, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0515, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.9468401074409485, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 1.0980640649795532, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0431, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 1.4567275047302246, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0467, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.3785778284072876, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0437, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.8112056255340576, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0406, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.8885411024093628, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0452, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.3356691002845764, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.033, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.7636258602142334, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.039, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.5050523281097412, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0331, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3761812150478363, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0346, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.560323178768158, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0417, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.5850566625595093, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0366, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.4377721846103668, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0315, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.5460193157196045, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0465, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.3818223476409912, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0313, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.566722571849823, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.037, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.970040500164032, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0354, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4968736171722412, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0376, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.5235893130302429, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0383, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.853208065032959, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0384, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.4627811312675476, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0615, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.4883791208267212, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0307, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.4702740013599396, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0539, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.5020611882209778, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0378, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.706611692905426, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0309, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.6137747764587402, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0364, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.45299193263053894, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0359, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.31410297751426697, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0425, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.48510870337486267, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.04, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.4697261154651642, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0401, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.8231471180915833, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0346, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.9511741995811462, + "learning_rate": 1.832162565208597e-05, + "loss": 0.038, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.4473752975463867, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0421, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.5309840440750122, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0375, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 1.1700010299682617, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0424, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.5007262229919434, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0389, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.8835527300834656, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.6059357523918152, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.37744027376174927, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0391, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.5641717910766602, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0383, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.4394749104976654, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0394, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.7094572186470032, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0384, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.6306723952293396, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0347, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.4480315148830414, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0415, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 1.014607310295105, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0426, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.7599517107009888, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0433, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 1.0942739248275757, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0378, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.47618037462234497, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0312, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6470023393630981, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0382, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.6031871438026428, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0336, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.7470970749855042, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0318, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.46166181564331055, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0361, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5585920214653015, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0443, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.5172198414802551, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0396, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.4908123314380646, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0294, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.5269665122032166, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0343, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.747257649898529, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0395, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.6794129610061646, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0471, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.4291394054889679, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0388, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.8051080107688904, + "learning_rate": 1.815952390818299e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.557299792766571, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0384, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.37832972407341003, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0333, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.30844688415527344, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.033, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.3014371395111084, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0344, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.778361439704895, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0351, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 1.14492666721344, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0462, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.35099321603775024, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0371, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.8470032215118408, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0339, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.641718327999115, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0363, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.6668172478675842, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0383, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.9396918416023254, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0401, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.5773718953132629, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0356, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.6474881172180176, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0487, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.5183063745498657, + "learning_rate": 1.807599344877606e-05, + "loss": 0.037, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.7699562311172485, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0487, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.6379490494728088, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0407, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4757876396179199, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0307, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.47382786870002747, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0367, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.6868136525154114, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0311, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.5475189089775085, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0293, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 1.013775110244751, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0383, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.46351560950279236, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0404, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.4883617162704468, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0408, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.6282979249954224, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0428, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 1.0833567380905151, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0394, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0405, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.7581565380096436, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0534, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.7900646328926086, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0432, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.6033529043197632, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0438, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.924926221370697, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0347, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.8485580682754517, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0523, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.3205278217792511, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0334, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.5392606854438782, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.03, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.6815987229347229, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0385, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.9605218768119812, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0359, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.5565723776817322, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0391, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.7528144717216492, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0431, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.5746167898178101, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0346, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.5058369636535645, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0346, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 1.1387027502059937, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0372, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.819324254989624, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0374, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.45600345730781555, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0344, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.7428935766220093, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0373, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.6960753202438354, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0387, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.6637990474700928, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0404, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.5612137317657471, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0375, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.6323001384735107, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0379, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.35169267654418945, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0371, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.38252803683280945, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0457, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.38694459199905396, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0345, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.37036198377609253, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0292, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.8060199618339539, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0398, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.44252580404281616, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0373, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.5565180778503418, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0345, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.4460795521736145, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0404, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.7309815883636475, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0364, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.6990997195243835, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0561, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.4198327660560608, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0401, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.04, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.48884230852127075, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0334, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.6440362930297852, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0451, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.9092825055122375, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0398, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.4839508533477783, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.8128801584243774, + "learning_rate": 1.776452218695584e-05, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.5291397571563721, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0394, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.6852243542671204, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0418, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.6294205188751221, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0374, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.5221384763717651, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0321, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.398296982049942, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0349, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.43008267879486084, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.6012991070747375, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0411, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.45076051354408264, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.037, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.6742259860038757, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0357, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.5989789962768555, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.037, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.4041040241718292, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0325, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.4937855899333954, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0354, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.5446217656135559, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0374, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.7479701638221741, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0415, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.7822495102882385, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0341, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.3672648072242737, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.035, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.5219965577125549, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0443, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.4092100262641907, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0331, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.5316944122314453, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0406, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 1.072263240814209, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0521, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.7448581457138062, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0362, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.44557711482048035, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0326, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.4298631250858307, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0365, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.45413365960121155, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0351, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.9562819004058838, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0394, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.9481335878372192, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0381, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.5020818114280701, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0402, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.6412234902381897, + "learning_rate": 1.758137056131309e-05, + "loss": 0.037, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.5511493682861328, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0535, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.5222594141960144, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0401, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.447127103805542, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0383, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.4780801832675934, + "learning_rate": 1.754802282200567e-05, + "loss": 0.041, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.2962804138660431, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0422, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.5125643014907837, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0337, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.4288216829299927, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0374, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4114690124988556, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0296, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.3511301577091217, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0315, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.8624657392501831, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0369, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.5518651008605957, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0364, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.5404661297798157, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0294, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.7494591474533081, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0315, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9748606085777283, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0429, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.8071768879890442, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0321, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.5210712552070618, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0355, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.6077958941459656, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0426, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.8688217997550964, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0366, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.7064969539642334, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0465, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0365, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.6350638270378113, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0419, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.42818939685821533, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0412, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.6915261745452881, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0327, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.9861057996749878, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.034, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.6910699009895325, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0463, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.6368144750595093, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0399, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 1.1909242868423462, + "learning_rate": 1.739216409306913e-05, + "loss": 0.042, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.6449970006942749, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0388, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.531061053276062, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0389, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.8275352716445923, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0503, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.8468548655509949, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0336, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.2949988842010498, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0342, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.30603477358818054, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 0.7177753448486328, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0381, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.4893733859062195, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0319, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.6618909239768982, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0317, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.5965152382850647, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0293, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.4357168674468994, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0478, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.9539002776145935, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0444, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.7171940207481384, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.037, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.5711817741394043, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.034, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.4134632647037506, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.39306095242500305, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0351, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.318985253572464, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0425, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.7674827575683594, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.041, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.7754977941513062, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0436, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.5827674269676208, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0371, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.3957774341106415, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0401, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.47415387630462646, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0344, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.6292631030082703, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.5913583636283875, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0385, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.465749055147171, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0402, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.7115443348884583, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0372, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.7476089596748352, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.042, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.5902891159057617, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0319, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.7117035984992981, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0312, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.7726907730102539, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0381, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.7318345308303833, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0464, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.8139578104019165, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0334, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.6128831505775452, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0338, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.478384405374527, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0361, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.36900776624679565, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0473, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 1.031351923942566, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0417, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.5248333215713501, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0402, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.6325647830963135, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.047, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.8417870402336121, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0406, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.617125391960144, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0385, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.4480224847793579, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0391, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 1.0203324556350708, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0379, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.6231842637062073, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0318, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37685611844062805, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0304, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 1.0700500011444092, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0362, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.4233555495738983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0341, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.7783017158508301, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0331, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.718287467956543, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0385, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.5477543473243713, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0308, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.5601311326026917, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0384, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.4944303631782532, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.5038384199142456, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0382, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.7288672924041748, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0319, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 1.0376721620559692, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0374, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.8827543258666992, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0351, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4307865798473358, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0321, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.5480561256408691, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0532, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.9598987102508545, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0365, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.4162677228450775, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0274, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.8729338049888611, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0437, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.7729384899139404, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0386, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.6997544169425964, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0303, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.49331608414649963, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0333, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.6684675812721252, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0329, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.5638986825942993, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.035, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.8375849723815918, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0431, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.5796175599098206, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0298, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.5302409529685974, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.032, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.43450990319252014, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0415, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.3897189795970917, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0372, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.8202592134475708, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0329, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.8023095726966858, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.3732883930206299, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0326, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.4916521906852722, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.031, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.46110638976097107, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.037, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.8587718605995178, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0351, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.7067242860794067, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.036, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.732545793056488, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.036, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.6573438048362732, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0392, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.6036579608917236, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0383, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.5556638836860657, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0396, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.7848073244094849, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0333, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.5758033394813538, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0315, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.5620765686035156, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0277, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.38210418820381165, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0437, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.6145310997962952, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0368, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.7370103001594543, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0349, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.942118763923645, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0399, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.5294848680496216, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0364, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.5716073513031006, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0313, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.4549729526042938, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0423, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.5841232538223267, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0369, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.3302208483219147, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.032, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.7107377648353577, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0382, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.6884296536445618, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0324, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.6279621720314026, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0314, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.882046103477478, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0408, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.8980706334114075, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0436, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.6433938145637512, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0395, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.6394492983818054, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.041, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.8700910806655884, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0333, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.6309515237808228, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0341, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.7955977916717529, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.8543604016304016, + "learning_rate": 1.663934987558109e-05, + "loss": 0.042, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.6915370225906372, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0347, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.6430726647377014, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0395, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.3080710768699646, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0299, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0407, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.7147136330604553, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0524, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.603560209274292, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.032, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.4913748502731323, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0419, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.532796323299408, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0463, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.7834717631340027, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0318, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.4865007698535919, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0329, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.5567988753318787, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0331, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.7487075328826904, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0408, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0294, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.7240496277809143, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0334, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.44733667373657227, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0378, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.7610008716583252, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0398, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 1.0738579034805298, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0461, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.5492804050445557, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0367, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.7817861437797546, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0392, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.6080313324928284, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0288, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.8218061923980713, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0335, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.6597305536270142, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0398, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.6254639625549316, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0339, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 1.0747283697128296, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0386, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.4679741859436035, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0409, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.7349653244018555, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0355, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.47712597250938416, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0524, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.8520345091819763, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0361, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.6470016837120056, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0296, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.8512763381004333, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0329, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.5876182913780212, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0381, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.47419166564941406, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0348, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.391215056180954, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0366, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.5373614430427551, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0373, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.23266319930553436, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0283, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.8146935105323792, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0377, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.5002696514129639, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0296, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.7518969774246216, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0394, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.44596755504608154, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0359, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.37095823884010315, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.031, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.48388785123825073, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0323, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.4681354761123657, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0573, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.9335370063781738, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0397, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.8231816291809082, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0307, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.7194622755050659, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0435, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.468923419713974, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.5806415677070618, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0422, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.6381694078445435, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0325, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.6025328636169434, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0321, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.7287771701812744, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0432, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.7109095454216003, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0315, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.4904409348964691, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0317, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.7382795214653015, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0296, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 1.2814927101135254, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.4594469368457794, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0297, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.5907943844795227, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.623093843460083, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0314, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.5146417021751404, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0362, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.5858095288276672, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0339, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.4178197383880615, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0445, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.37311851978302, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0321, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.6305625438690186, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0376, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.5927552580833435, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0339, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.4024806022644043, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.5766516327857971, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0325, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.4729812443256378, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0476, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.4650471806526184, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0387, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.6432391405105591, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.6335821151733398, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0307, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.5947774052619934, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0374, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.7248526811599731, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0286, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.5646173357963562, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0426, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.4240330457687378, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0261, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.6439619064331055, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0325, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.5899927020072937, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0328, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.6412765383720398, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.027, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.28143197298049927, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0285, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.2767931818962097, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0312, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.47175201773643494, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0318, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.4454171359539032, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0357, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.4573518931865692, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0319, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.5321150422096252, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0423, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.27531248331069946, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0284, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.663298487663269, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0328, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.9017484188079834, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0328, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0445, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.4777899980545044, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0348, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.5475958585739136, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0418, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.524467408657074, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0301, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.6302708387374878, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0334, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.41625329852104187, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0353, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2699313759803772, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0387, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.701999306678772, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.6053565144538879, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0343, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.864326000213623, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0371, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.7532107830047607, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0323, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.5603524446487427, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0357, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.5668624639511108, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0421, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.6352995038032532, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0381, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.7873902320861816, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0293, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.5853860378265381, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0336, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.525260329246521, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0404, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.4027518033981323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0334, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.9426722526550293, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0397, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.6003656983375549, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0408, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.643667459487915, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0507, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.6342907547950745, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0338, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.4388107657432556, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0393, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.3304736614227295, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0371, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.6479781866073608, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0357, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.5461524128913879, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0367, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.4362160563468933, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0302, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.5188114643096924, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0322, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.34805068373680115, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0355, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.5073755383491516, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0446, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.5647034645080566, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0386, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.5983169078826904, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.4163302481174469, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0278, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.5769792199134827, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.33103784918785095, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0272, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.6019038558006287, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0286, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.8199634552001953, + "learning_rate": 1.56658563993822e-05, + "loss": 0.041, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.7426667213439941, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0327, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.3630203306674957, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0316, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.7804543972015381, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0369, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.43314239382743835, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0362, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.5570499897003174, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0307, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.5796618461608887, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0312, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.7355082035064697, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0357, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.39807555079460144, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0281, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.7723329663276672, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0314, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.3936077058315277, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0344, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.6881195902824402, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0343, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.5343065857887268, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0336, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.6643530130386353, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.032, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.5642407536506653, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0326, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.6929567456245422, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0351, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.33013442158699036, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0362, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 1.056101679801941, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0443, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.5164589881896973, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0446, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.319035142660141, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0367, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.8530817627906799, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0321, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.7768056392669678, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0318, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.4015219211578369, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0263, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.6409371495246887, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0371, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.5829829573631287, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0424, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.8098331093788147, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0318, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.40581029653549194, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0345, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.5018268823623657, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0338, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.3689005970954895, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0304, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.4961407482624054, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0349, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.5551972389221191, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0389, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.5989762544631958, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0308, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.33431145548820496, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0291, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.5390793085098267, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0409, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.6348057389259338, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0299, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.9015149474143982, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0372, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.4148661494255066, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0351, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.48212167620658875, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0369, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.6210904121398926, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0387, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.4606397747993469, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0325, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.597671627998352, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0264, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.39612457156181335, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0291, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.514916718006134, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0327, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.3551333248615265, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0306, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.3721555173397064, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0343, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3669307231903076, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0339, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.5142899751663208, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0388, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.7722563147544861, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0319, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.5405625104904175, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.025, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.6617732048034668, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0361, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.8938334584236145, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0326, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.7913880944252014, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0325, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.6919751763343811, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0353, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.6518043279647827, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0292, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.8302627801895142, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0292, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.6278629302978516, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0314, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.42736759781837463, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0313, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 1.0469647645950317, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.038, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.4306422173976898, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.692587673664093, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.034, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.8272542953491211, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0332, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.700703501701355, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0435, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22474133968353271, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0348, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.47771376371383667, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0365, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.5043072700500488, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0336, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.4886966347694397, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0291, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.3845444321632385, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0418, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.6324570775032043, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0357, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.5614244937896729, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0351, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.4815816879272461, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0401, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.7729785442352295, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0357, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.589121401309967, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0319, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.5420895218849182, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0346, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.4504237771034241, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0279, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.26984909176826477, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.6075000762939453, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0319, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.6065084338188171, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0383, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.573225736618042, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0424, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.8821173906326294, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0409, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.4947790205478668, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0472, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.748337984085083, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0384, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.6375566124916077, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0373, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.6218035221099854, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0343, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.4296681880950928, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0317, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3609360158443451, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0348, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.49597665667533875, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.034, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.4339931309223175, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0351, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.44051092863082886, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0391, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.41610655188560486, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0345, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.6215106844902039, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0439, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.6418285965919495, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0289, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.6148926019668579, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0396, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.8690620064735413, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0371, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.4794996678829193, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.7622746229171753, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0396, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 1.0384955406188965, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0352, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.33424243330955505, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0272, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.5626234412193298, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0267, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.31714314222335815, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0297, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.8281066417694092, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0337, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.6054716110229492, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0336, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.5764144659042358, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0296, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.4696876108646393, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0318, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.5324695110321045, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0294, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.2989593744277954, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0275, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.6373855471611023, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0334, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.5332064032554626, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0333, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.4900652766227722, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.6812027096748352, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0321, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.6765509843826294, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0329, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.5016193389892578, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.034, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.5259473919868469, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0341, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.4551076292991638, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0289, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.5946309566497803, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0367, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.8045580387115479, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0292, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 1.089473843574524, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0433, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.7314861416816711, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0344, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.3244793713092804, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0329, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.9454575181007385, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.041, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.4321480393409729, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0338, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.7338399887084961, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0317, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.5811594724655151, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0299, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 1.1259782314300537, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0402, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.4460951089859009, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0279, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.4996945858001709, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0331, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.6428117156028748, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0339, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.7815113663673401, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0333, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.46364331245422363, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0321, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.6084109544754028, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0347, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.5775942206382751, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.4764224886894226, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0326, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.49608105421066284, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.033, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.40599140524864197, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0323, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.44920462369918823, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0348, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.393081396818161, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0329, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.5393109917640686, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0332, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.49641427397727966, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0341, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.4762181341648102, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0293, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.7498350143432617, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0338, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.5212231874465942, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0336, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3803718388080597, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0336, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.3723069429397583, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0313, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.6411343216896057, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0298, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.7487270832061768, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0334, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.4146348237991333, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0362, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.6354920864105225, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0345, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.8422425985336304, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.6452838182449341, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0317, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.6057304739952087, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0349, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.4880058467388153, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0283, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.6094764471054077, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0424, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.552979588508606, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0318, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.5134180188179016, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0267, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.3264164626598358, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0347, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.6406404972076416, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0326, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.4818336069583893, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0357, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.4660695791244507, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0348, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.527518093585968, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0293, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.5105645656585693, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0299, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.5807327628135681, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0348, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.34552720189094543, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0281, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.6902264952659607, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0345, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.7842390537261963, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0392, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.37371599674224854, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0307, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.4447094798088074, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0343, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.5179654359817505, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0328, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.34313148260116577, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0327, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.5038807988166809, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0398, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.5751231908798218, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0365, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.23205915093421936, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0338, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.3348182141780853, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0264, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.432725727558136, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0377, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.5504162907600403, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0334, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.7994229793548584, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0369, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.7374292016029358, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0305, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.786674976348877, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0283, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.39285191893577576, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.028, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.49710261821746826, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0285, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.2925172448158264, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0353, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.5930903553962708, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0265, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.5205737352371216, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0349, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.5042659044265747, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0376, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.6537132263183594, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0402, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.5453435182571411, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0344, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.7153663635253906, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0365, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.4821360409259796, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0359, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.389950156211853, + "learning_rate": 1.403120543105273e-05, + "loss": 0.031, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.6750137805938721, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0353, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.5380377173423767, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0329, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.45814576745033264, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0312, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.6910536289215088, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0349, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.49182868003845215, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0377, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.41329771280288696, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0383, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.47242429852485657, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0313, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.45115360617637634, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0294, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.44364428520202637, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0328, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.4205247461795807, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0282, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 1.0961225032806396, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0274, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.6065059304237366, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0327, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.3095875084400177, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0348, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.8527400493621826, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0285, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.4449825882911682, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0435, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 1.1708461046218872, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0312, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.6145966053009033, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0283, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.5100684762001038, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0331, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.37704023718833923, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0327, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.6774486899375916, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0347, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.4984931945800781, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0303, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.6189061403274536, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0316, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4665672183036804, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.038, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.898800790309906, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0292, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.5205129384994507, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0322, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.588542640209198, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0307, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.620620846748352, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.035, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.639234185218811, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0296, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.38672956824302673, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0355, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.5244165062904358, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0305, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.8960945010185242, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0323, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3789278566837311, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.031, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.5104514956474304, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0405, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.5860878825187683, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0376, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.9913963079452515, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0386, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.4112319350242615, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0276, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.703815221786499, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0303, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.7342479825019836, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0303, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.46025165915489197, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0324, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.3976695239543915, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0255, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.4137699604034424, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0298, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.6333696842193604, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0438, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.5179958343505859, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0268, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.5947912335395813, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0266, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.7916423678398132, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0363, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.7686305046081543, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0338, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.5727254152297974, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0275, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.8913756012916565, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0365, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.45855259895324707, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0401, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.8214496374130249, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0371, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.5001949667930603, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.033, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.6546716094017029, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0422, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.35789239406585693, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0323, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.7539666891098022, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0316, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.422543466091156, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0388, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.5595449805259705, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0351, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.3847978115081787, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0285, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.4276559352874756, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0292, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.5125867128372192, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0351, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.7208243012428284, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0293, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.5181360244750977, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0316, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.3499206304550171, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0281, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.26258599758148193, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.027, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.7002774477005005, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.031, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.5419202446937561, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0384, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.3112017512321472, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0234, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.6459445357322693, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0302, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.5128807425498962, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0385, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.41403454542160034, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0321, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.4647153615951538, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0358, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.29951611161231995, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0288, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.3440749943256378, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0274, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.413753867149353, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0276, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.29087361693382263, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.03, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.7001593708992004, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0277, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.47245970368385315, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0426, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.5747501850128174, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0337, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.42420580983161926, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0407, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.2931080162525177, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0344, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.8410253524780273, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0385, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.27601751685142517, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0304, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5673372745513916, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0261, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.5385505557060242, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0296, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.4159039556980133, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0343, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 1.0409079790115356, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0325, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.5017931461334229, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0311, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.45170727372169495, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0302, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.7260886430740356, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0353, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.7251535058021545, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0329, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.21863135695457458, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0354, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.5168152451515198, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0268, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.509765088558197, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0321, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.4227997958660126, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.031, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.5740527510643005, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0351, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.5497387647628784, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0277, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.3965212106704712, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.028, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.43198928236961365, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0421, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.42254316806793213, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0335, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.3395012617111206, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0309, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.6258816719055176, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0287, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.7914189100265503, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0263, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.4104739725589752, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0282, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.47704172134399414, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0358, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.7908433675765991, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0341, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.7039026021957397, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0369, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.4095489978790283, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.047, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.6500707864761353, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0285, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3794250190258026, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0293, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.3065261244773865, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.031, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.3773103654384613, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0303, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.602186918258667, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0398, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.5309048891067505, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0251, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.9474682211875916, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0345, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.7786683440208435, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0289, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.6320096850395203, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0326, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.7034086585044861, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0332, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.5060988664627075, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0337, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.7484520673751831, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0317, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.6556681394577026, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0349, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.41952699422836304, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0318, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.4678110182285309, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0328, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.35579657554626465, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0346, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.5984554290771484, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0277, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.41169118881225586, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0288, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.5163332223892212, + "learning_rate": 1.285944160290905e-05, + "loss": 0.027, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.780305802822113, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0249, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.4293205142021179, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0302, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.650065004825592, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0349, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.3155161142349243, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0333, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.5841111540794373, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0371, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.3873291015625, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0304, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.39657002687454224, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0279, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.6305680871009827, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0293, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.5810249447822571, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0317, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.6288999319076538, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0283, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.5402754545211792, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0258, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 1.3184820413589478, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0398, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.9564218521118164, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0301, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.8810652494430542, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0376, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.4254887104034424, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0336, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.45076319575309753, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0266, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.6057546138763428, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0292, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.4007343649864197, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0352, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.4183088541030884, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0265, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.368300199508667, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0326, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.4838104844093323, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0262, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.5136057138442993, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0299, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.5161435604095459, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0339, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.6350359320640564, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0361, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.5247905254364014, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0259, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.5668240785598755, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0324, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.48688119649887085, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0395, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.8496071100234985, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0326, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.7072296142578125, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0307, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.7262448072433472, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0376, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.5265096426010132, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0331, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.7246168851852417, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0286, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.4539868235588074, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.036, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.36881664395332336, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0302, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.37113773822784424, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0278, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.537762463092804, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0325, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.6519997715950012, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0309, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.31448549032211304, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0245, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.43815988302230835, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0398, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.525791585445404, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0261, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.4887944757938385, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.025, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.5287007689476013, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0278, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.7277513146400452, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0304, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.6415050029754639, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0292, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.48691895604133606, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0337, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.53068608045578, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0338, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.5464624762535095, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0303, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.3911614418029785, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0345, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.6894099116325378, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0365, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.5268317461013794, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0405, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.8635499477386475, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0321, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21542859077453613, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0264, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.6257337331771851, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0355, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.6525475978851318, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0304, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.4599299430847168, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0314, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.7497361898422241, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.031, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.3124896287918091, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0257, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.6170748472213745, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0323, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.4619428515434265, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0315, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.5088011026382446, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0255, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.5397948622703552, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0265, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.457082062959671, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0279, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.4131294786930084, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0269, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 1.1949660778045654, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.6057063341140747, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0306, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.26918280124664307, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0283, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.48841091990470886, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0323, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.6195886135101318, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0295, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.5798623561859131, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.031, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.4877539277076721, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0267, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.33261221647262573, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0261, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.8361077904701233, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0311, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.305922269821167, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0302, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.22662357985973358, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.028, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.4273515045642853, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0307, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.521216869354248, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0277, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.7090896368026733, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0346, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.3693661391735077, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0305, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.3651321530342102, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0263, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.5577923655509949, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0357, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.6504148840904236, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0404, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.49205282330513, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.035, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.6053458452224731, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0328, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.5949649214744568, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0302, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.5310356020927429, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0264, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4087911546230316, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0273, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.35929426550865173, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0274, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.5112904906272888, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0253, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.39148232340812683, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0305, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.47718697786331177, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0304, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.620936393737793, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0289, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.8953443169593811, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0328, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.4663226902484894, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0302, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.707167387008667, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0319, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.5325813889503479, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0318, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.6239158511161804, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0289, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.38823947310447693, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0266, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.48849165439605713, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0234, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.23214028775691986, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0276, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3467197120189667, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0282, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.2009357064962387, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0298, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.8589951395988464, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0264, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.43969056010246277, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0292, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.5750611424446106, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0289, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.5399556756019592, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0307, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.20517395436763763, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0249, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.7490189671516418, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0246, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.6661257743835449, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0325, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.571394681930542, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0342, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.8792482018470764, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0332, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.5770248770713806, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0286, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.62962406873703, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0246, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.4651380479335785, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.037, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.5087499022483826, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0265, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.44421979784965515, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0306, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.6521517038345337, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0334, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.5384942889213562, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0296, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.41909387707710266, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0297, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.6697047352790833, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0331, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.4015032947063446, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0326, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.48070228099823, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0278, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.8651071786880493, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0242, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 1.17703378200531, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0288, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.45865103602409363, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0322, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.41243845224380493, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0297, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.482997864484787, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0305, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.5319142937660217, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0284, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.6116752028465271, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0311, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.4214901328086853, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0269, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.6246733069419861, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.026, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.4263368248939514, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0305, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.4059041738510132, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.022, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.6362516283988953, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0265, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.2905973494052887, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0297, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.42270833253860474, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0255, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.26410749554634094, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0252, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.7570974230766296, + "learning_rate": 1.153689339251154e-05, + "loss": 0.027, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.5941224098205566, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0295, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.3985750079154968, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0337, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.3877560496330261, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.024, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.44742006063461304, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0284, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.3280893564224243, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0318, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.5289477109909058, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0341, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.4976208806037903, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0239, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.6153465509414673, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0252, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.6112402677536011, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0292, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.4973732531070709, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0307, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.5871816277503967, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0254, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 1.2150986194610596, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.033, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.6406526565551758, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0265, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.4251798093318939, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0269, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.4702431857585907, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0311, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.3235304355621338, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0236, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.4913889467716217, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0231, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.4980977177619934, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0289, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.740922212600708, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0334, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.3305300772190094, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0301, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.7037357091903687, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0311, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.44783756136894226, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0339, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.7776843309402466, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0349, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.49181437492370605, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0285, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.333814799785614, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0284, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 1.203652262687683, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0365, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.521643877029419, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0313, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.33309581875801086, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0265, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.48567256331443787, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0357, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8473871946334839, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0355, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.43827518820762634, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0266, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.5849157571792603, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0317, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.5690399408340454, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0266, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.6484784483909607, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0294, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.8894811272621155, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0239, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4575272798538208, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0323, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.4288756847381592, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.032, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.8871303200721741, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0243, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.5861580967903137, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0335, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.4159319996833801, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0247, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.6948496699333191, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0299, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.5089551210403442, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0333, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.6912631392478943, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0303, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.6213784217834473, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0295, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.4634060561656952, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0261, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.5664045214653015, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0262, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.7963227033615112, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0278, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.45378491282463074, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0268, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.8970746994018555, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0271, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.5109472274780273, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0307, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.5023297667503357, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0263, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.6055631041526794, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0285, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.38602766394615173, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0282, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.5447302460670471, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0319, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.6613780856132507, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0271, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 1.0358555316925049, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.026, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.4463629722595215, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0271, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.5373798608779907, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.025, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.7735916972160339, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0325, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.5017692446708679, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0262, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.3406142592430115, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0271, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.28971537947654724, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0238, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.45441415905952454, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0261, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.4653581976890564, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.026, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.5449947714805603, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0314, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.41015395522117615, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0272, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.5936392545700073, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0269, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.5043690800666809, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0256, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.6176534295082092, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0285, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.6774734258651733, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0268, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.7045454978942871, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0305, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.5905448794364929, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0284, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.7881343364715576, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0321, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.6635507941246033, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0284, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.46298888325691223, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0394, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.5187172889709473, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0257, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.5974661707878113, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0305, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.5171123743057251, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0275, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.35988888144493103, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0295, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.30543047189712524, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0334, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.6582810878753662, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0309, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.4986134171485901, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0294, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.5560855269432068, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0224, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.28974607586860657, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0313, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.24015791714191437, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.026, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.2704199552536011, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0244, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.6661707162857056, + "learning_rate": 1.068904422762975e-05, + "loss": 0.027, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.5058556795120239, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0254, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.7086800336837769, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0242, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.6752822399139404, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0262, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.8279762268066406, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0312, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.5070614814758301, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0308, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.3933897614479065, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0287, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.37238794565200806, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0325, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.7591347098350525, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0265, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.4841652810573578, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0331, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.45236295461654663, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0412, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.4774094820022583, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0289, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.47564345598220825, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0294, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.341337651014328, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0281, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.341701865196228, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0224, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.6621959209442139, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0283, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.348466694355011, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0234, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.35208311676979065, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0248, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.4973156154155731, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0246, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.3668982982635498, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0228, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.4771873950958252, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0303, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.3595021665096283, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0265, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.6013099551200867, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0297, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.40996676683425903, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0321, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.45742037892341614, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0288, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.8092222213745117, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0278, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.32741186022758484, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0288, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.5716732740402222, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0256, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.3263239562511444, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0271, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.35390567779541016, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0266, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.36520150303840637, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0265, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.46227532625198364, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0305, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.40079647302627563, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0327, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.3689155578613281, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0249, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.49527907371520996, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.029, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.38931334018707275, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0233, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.5698918700218201, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0269, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 1.0959579944610596, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.029, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.6321646571159363, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0276, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.7166606783866882, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0292, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.6464444994926453, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0246, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.7318128347396851, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0296, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.4828032851219177, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0247, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.4509548842906952, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0241, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.413630872964859, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0313, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.42443349957466125, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0316, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.8199112415313721, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0389, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.28918105363845825, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0242, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.6759344339370728, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0308, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.5480250120162964, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.025, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.48897549510002136, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.027, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.6111220121383667, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0276, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.8852546215057373, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0251, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.5098162889480591, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.022, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.45974940061569214, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0206, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3925095200538635, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0251, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.5461363792419434, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0217, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.5685333609580994, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0231, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.494150310754776, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0243, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.8770614862442017, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0286, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.27142134308815, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0253, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.3365682363510132, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0241, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.5512370467185974, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0242, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.5581703782081604, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0276, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.306773841381073, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0262, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.44620928168296814, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0229, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.5870804786682129, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0228, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.26162099838256836, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0278, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.27250319719314575, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0293, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.8330137729644775, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0315, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.5206989645957947, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0282, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.5408382415771484, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0359, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.30517199635505676, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0267, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.5315027236938477, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0206, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.46061626076698303, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0222, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.47393080592155457, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0262, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.3686772882938385, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0254, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.3312757611274719, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0243, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.565447986125946, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0267, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.5690101385116577, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0237, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.44088438153266907, + "learning_rate": 9.911670744652783e-06, + "loss": 0.028, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.3708919882774353, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0265, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.589698851108551, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0297, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.6541375517845154, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0288, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.5304558873176575, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0243, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.5774737000465393, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0277, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.5616280436515808, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0267, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.6129759550094604, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0223, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.45278221368789673, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0304, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.44487202167510986, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0296, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.5391712188720703, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0256, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.43523359298706055, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0277, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.5308435559272766, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0242, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.3361283540725708, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0236, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.3764631450176239, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0304, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.9003425240516663, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0278, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.2787775993347168, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0219, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.40089285373687744, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0284, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.3619711101055145, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0252, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.7354542016983032, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0242, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.3854006826877594, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0302, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.3318389058113098, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0265, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.5286651849746704, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0235, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.24921932816505432, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0259, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.7376067042350769, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0238, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.35099226236343384, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0257, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.3805389702320099, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0198, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.4433703124523163, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0241, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.3667793571949005, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0268, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.2963331639766693, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0223, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.9817414879798889, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0248, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.6529688835144043, + "learning_rate": 9.612315882780393e-06, + "loss": 0.032, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.7663154602050781, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0267, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.6086964011192322, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0281, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.5240464806556702, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0339, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.6558368802070618, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0284, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.6192268133163452, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0309, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.5293763875961304, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0257, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.38831329345703125, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0239, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.12827467918396, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0323, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.411818265914917, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0274, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.5521355867385864, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0233, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.26673075556755066, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0317, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.5205486416816711, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0273, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.8010990619659424, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0292, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.420612633228302, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0274, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.4811270236968994, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0277, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.4959382712841034, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0288, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.4607725739479065, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0245, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.9101414680480957, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0283, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.38626620173454285, + "learning_rate": 9.42959233811777e-06, + "loss": 0.026, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.5709372758865356, + "learning_rate": 9.419993062475743e-06, + "loss": 0.021, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.4417913854122162, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0291, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.5651213526725769, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0228, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.4716165363788605, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0242, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.9120892286300659, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0296, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.5004292130470276, + "learning_rate": 9.372024722887089e-06, + "loss": 0.033, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.3422714173793793, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0284, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.5391610264778137, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0362, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.5446203351020813, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0247, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.5441875457763672, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0284, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.48274070024490356, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0245, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.6035326719284058, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0226, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.3104001581668854, + "learning_rate": 9.304949604077693e-06, + "loss": 0.029, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.27859869599342346, + "learning_rate": 9.295375311262483e-06, + "loss": 0.022, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.3896406292915344, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0235, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.4526473581790924, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0289, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.6624506115913391, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0265, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.6976125836372375, + "learning_rate": 9.257098257046206e-06, + "loss": 0.029, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.5974310040473938, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0205, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.7627739906311035, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0333, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.3166525065898895, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0309, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.41519322991371155, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0223, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.31840237975120544, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0239, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.47412827610969543, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0228, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.41170552372932434, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0209, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.45858854055404663, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0243, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.7870534658432007, + "learning_rate": 9.171095634265995e-06, + "loss": 0.027, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.4080354869365692, + "learning_rate": 9.161550369445782e-06, + "loss": 0.023, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.47916823625564575, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0303, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.6911760568618774, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0263, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.3980148732662201, + "learning_rate": 9.132927564918328e-06, + "loss": 0.028, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.47085851430892944, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0266, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.5085862874984741, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0239, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.5219245553016663, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0267, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.5199264287948608, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0277, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.6157195568084717, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0343, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.5366696715354919, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0271, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.3640076220035553, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0258, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.5320505499839783, + "learning_rate": 9.05669731553499e-06, + "loss": 0.024, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.507826566696167, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0253, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.741392195224762, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0242, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.5325136184692383, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0224, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.4709665775299072, + "learning_rate": 9.018636566864313e-06, + "loss": 0.026, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.4371986985206604, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0264, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.47594818472862244, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0224, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.488423228263855, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0261, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.24745763838291168, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0206, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.5042629837989807, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0305, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.5255836844444275, + "learning_rate": 8.961615424107555e-06, + "loss": 0.026, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.4605107307434082, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0274, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.3252561390399933, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0227, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.35779184103012085, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0296, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.2960403263568878, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0212, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.6344659328460693, + "learning_rate": 8.914163487132906e-06, + "loss": 0.026, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.4614463150501251, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0234, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4490053951740265, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0265, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.5291271209716797, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0326, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.5311887264251709, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0257, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.5647584199905396, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0295, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.3913862705230713, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0256, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.4476219415664673, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0248, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.7807655930519104, + "learning_rate": 8.83836825410936e-06, + "loss": 0.026, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.38984328508377075, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0247, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.5757346153259277, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0296, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.25636178255081177, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0222, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.45617344975471497, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0224, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.3066493272781372, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0237, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.26513972878456116, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0277, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.445230633020401, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0248, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.4914413392543793, + "learning_rate": 8.762735374981932e-06, + "loss": 0.022, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.41469570994377136, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0245, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.33235347270965576, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0229, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.4890037775039673, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0247, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.41330578923225403, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0285, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.6309427618980408, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0233, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.42090296745300293, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0254, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.5888519287109375, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0262, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.5488774180412292, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0262, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.48015111684799194, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0219, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.4484168291091919, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0276, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.4128018319606781, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0218, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.5151517987251282, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0242, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.6248350143432617, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0267, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.4116908013820648, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0242, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.6138579249382019, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0282, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.22843605279922485, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0284, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.49555841088294983, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0244, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.5752411484718323, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0275, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.5129706859588623, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0237, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.751230001449585, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0257, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.47749435901641846, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0277, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21702095866203308, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0255, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.30658838152885437, + "learning_rate": 8.54624657467318e-06, + "loss": 0.024, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.3589625954627991, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0215, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.5434426069259644, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0224, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.8732438683509827, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0289, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.34988290071487427, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0226, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.4021032154560089, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0248, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.4676196873188019, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0235, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.41646474599838257, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0235, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.5892519950866699, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0221, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.5757095217704773, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0258, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.4664652645587921, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0275, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.4674879014492035, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0285, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.7277936339378357, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0316, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.40373867750167847, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0213, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.8632686138153076, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0239, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.5620945692062378, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0259, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3430384695529938, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0287, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.46981969475746155, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0218, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.3494231700897217, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0238, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.514975368976593, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0205, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.6442168951034546, + "learning_rate": 8.359228888944986e-06, + "loss": 0.021, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.32178881764411926, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0219, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.48865941166877747, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0261, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.6131434440612793, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0269, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.4471806585788727, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0251, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.8255780935287476, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0229, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.843673586845398, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0278, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.4278610348701477, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0228, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.5036011338233948, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0291, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.5141382813453674, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0217, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.8976346850395203, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0248, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.5634751319885254, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0276, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.5327013731002808, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0279, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.2723959982395172, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0225, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.4455258846282959, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0222, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.3784103989601135, + "learning_rate": 8.219774325200873e-06, + "loss": 0.024, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.8102694749832153, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0231, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.5179240703582764, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0255, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.39830490946769714, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0264, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.32860279083251953, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0241, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.5459582209587097, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0193, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.3841477036476135, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0282, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.7849119305610657, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0319, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.4457703232765198, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0279, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.30464428663253784, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0184, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 1.0635287761688232, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0265, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.33294421434402466, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0235, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.5644985437393188, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0218, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.4975566565990448, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0261, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.7503839135169983, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0218, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.35363277792930603, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0198, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.43968406319618225, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0253, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.4553394615650177, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0266, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.45489153265953064, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0264, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.424696147441864, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0209, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.4819740653038025, + "learning_rate": 8.03498318084394e-06, + "loss": 0.022, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.564834475517273, + "learning_rate": 8.025779439806006e-06, + "loss": 0.024, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.7905157804489136, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0261, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.6985124349594116, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0315, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.42378291487693787, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0237, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.5980759263038635, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0217, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.45916232466697693, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0235, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.25486481189727783, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0231, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.4072360694408417, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0261, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3813820481300354, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0209, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.3040210008621216, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0225, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.30910906195640564, + "learning_rate": 7.933935782312965e-06, + "loss": 0.026, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.6573566794395447, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0262, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.30632153153419495, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0251, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.3277539610862732, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0233, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.49589917063713074, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0211, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.4149130880832672, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0203, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.7051926851272583, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0272, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.8553881049156189, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0236, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.5676615238189697, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0242, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.29548707604408264, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0236, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.36076608300209045, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0219, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.3657922148704529, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0227, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.27593615651130676, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0251, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.35554730892181396, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0259, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.45652297139167786, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0274, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.5757999420166016, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0222, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.5138059854507446, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0216, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.338874876499176, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0232, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.48215195536613464, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0226, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.30239933729171753, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0205, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.6099343299865723, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0219, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.6730902791023254, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0239, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.4575020968914032, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0204, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.2673267424106598, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0222, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.3593531548976898, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0225, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.5385488867759705, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0248, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.3900541663169861, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0277, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.6182276010513306, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0241, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.4897976815700531, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0229, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.5717247128486633, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0273, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.4837515950202942, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0219, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.31954509019851685, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0271, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.23005163669586182, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0204, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.500217616558075, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0229, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.47326523065567017, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0203, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.5074726939201355, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0249, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.6583673357963562, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0243, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.7585731744766235, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0264, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.3782348036766052, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0216, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.43963512778282166, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0201, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.6450467109680176, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0254, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.3420482575893402, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0224, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.3532888889312744, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0216, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.32494598627090454, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0196, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.2898419499397278, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0234, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.4379838705062866, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0233, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.5390518307685852, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0169, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.3786150813102722, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0203, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.3376149833202362, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0266, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.40810349583625793, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0241, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.24485738575458527, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0199, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.4670563340187073, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0184, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.374255508184433, + "learning_rate": 7.4623904967312e-06, + "loss": 0.018, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.4191536605358124, + "learning_rate": 7.453427567620127e-06, + "loss": 0.022, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3807078003883362, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0232, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.7537381649017334, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0202, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.36507129669189453, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0236, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.24461498856544495, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0221, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.351654589176178, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0236, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.35627686977386475, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0213, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.4586603343486786, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0304, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.4082098603248596, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0237, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.47707459330558777, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0247, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.4687316119670868, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0344, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.4660017788410187, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0214, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.4644101560115814, + "learning_rate": 7.346200065486093e-06, + "loss": 0.022, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.3139079213142395, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0234, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.36445188522338867, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0262, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.6457782983779907, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0261, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.4184044599533081, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0245, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.44356703758239746, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0215, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.5394402742385864, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0302, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 0.5960429906845093, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0234, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2850514352321625, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0243, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.45071718096733093, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0233, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.3157344162464142, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0254, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.45518410205841064, + "learning_rate": 7.248450164740439e-06, + "loss": 0.024, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.2323702722787857, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0226, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.6025039553642273, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0246, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.4983830749988556, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0199, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.3684524595737457, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0252, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.36924007534980774, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0277, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.3531496822834015, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0228, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.3995579779148102, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0193, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.4124946892261505, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0221, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.3897329866886139, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0221, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.45230787992477417, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0238, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.45878538489341736, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0244, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.4302407503128052, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0237, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.30422642827033997, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0173, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.49566513299942017, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0201, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.43262094259262085, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0227, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.8250450491905212, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0259, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.3265332281589508, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0205, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.2871774435043335, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0201, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.4341558814048767, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0199, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.43365293741226196, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0201, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.5876246690750122, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0205, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.2719171643257141, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0211, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.38791123032569885, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0244, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.4082484543323517, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0206, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.5010205507278442, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0245, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.4404369294643402, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0268, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.5171347856521606, + "learning_rate": 7.010805483338283e-06, + "loss": 0.024, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.5137951970100403, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0241, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.563709557056427, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0193, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.44687238335609436, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0207, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.33815798163414, + "learning_rate": 6.975884226362e-06, + "loss": 0.0246, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.33789384365081787, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0206, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.38053908944129944, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0195, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.5730066299438477, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0199, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.42453598976135254, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0218, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.48010921478271484, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0328, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.5227254629135132, + "learning_rate": 6.923644220932124e-06, + "loss": 0.019, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4078599810600281, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0212, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.4473094046115875, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0281, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.3459968864917755, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0231, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.4205886721611023, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0256, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.5397320985794067, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0214, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.6208626627922058, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0224, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.34377506375312805, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0197, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.4086950123310089, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0202, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.5211176872253418, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0201, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3705415725708008, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0219, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.32692769169807434, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0204, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.42599135637283325, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0213, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.565449595451355, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0223, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.4027825593948364, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0233, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.4833034574985504, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0309, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.5570312738418579, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0213, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.30241742730140686, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0197, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.37468239665031433, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0214, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.5555301904678345, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0223, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.6084730625152588, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0261, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.5931955575942993, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0237, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.30350545048713684, + "learning_rate": 6.733587654719298e-06, + "loss": 0.02, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.6784055233001709, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0281, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.5559973120689392, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0204, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.7529487013816833, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0235, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.7032052874565125, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0176, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.5018401741981506, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0197, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.5020368695259094, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0231, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.3605690598487854, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0254, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.3482762575149536, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0223, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.4260469675064087, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0199, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.23622000217437744, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0239, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.3683573007583618, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0223, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.32972025871276855, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0228, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.4159783124923706, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0221, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.24288412928581238, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0188, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.42375463247299194, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0183, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.26672226190567017, + "learning_rate": 6.596880604028027e-06, + "loss": 0.02, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.30816635489463806, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0219, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.315452516078949, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0218, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.5412175059318542, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0233, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.4290241003036499, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0233, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.3977762460708618, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0239, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.4023628532886505, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0197, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.8707197308540344, + "learning_rate": 6.53748481975927e-06, + "loss": 0.029, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.37878328561782837, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0218, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.685556173324585, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0248, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.5783588886260986, + "learning_rate": 6.512107839793337e-06, + "loss": 0.02, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.5456825494766235, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0279, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.6162738800048828, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0259, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.38887348771095276, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0198, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.5207514762878418, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0201, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.671120822429657, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0259, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.28870952129364014, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0175, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.3909374177455902, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0214, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.3419650197029114, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0217, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.563515305519104, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0185, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.6295453310012817, + "learning_rate": 6.427861749601945e-06, + "loss": 0.023, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.4404713213443756, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0188, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.698448121547699, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0225, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.5679222941398621, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0213, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.5237470269203186, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0261, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.4205586016178131, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0232, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.36608314514160156, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.02, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.49511757493019104, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0247, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.3475521206855774, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0202, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.36345914006233215, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0197, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.34304162859916687, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0183, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.41459065675735474, + "learning_rate": 6.335811156758245e-06, + "loss": 0.02, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.34139952063560486, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0211, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.29463231563568115, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0225, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.37984198331832886, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0201, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.21912901103496552, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0226, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.34660178422927856, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0179, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.6080809235572815, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0187, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.43388310074806213, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0226, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.53389972448349, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0237, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.39731428027153015, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0176, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.32715681195259094, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0211, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.36709150671958923, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0194, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.5554866790771484, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0202, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.26253199577331543, + "learning_rate": 6.227878992893104e-06, + "loss": 0.02, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.3686104714870453, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0191, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.36151114106178284, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0213, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.5019435882568359, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0203, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 1.1914043426513672, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0249, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.45042529702186584, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0244, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.3239169120788574, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0219, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.3253174424171448, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0226, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.6497724652290344, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0238, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.5800855159759521, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0211, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.29717954993247986, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0198, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.35056066513061523, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0219, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.28448906540870667, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0227, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.33300310373306274, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0165, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.5134487748146057, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0219, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.45153549313545227, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0191, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.6483689546585083, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0211, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.5660327076911926, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0207, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.6027820706367493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0201, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.6102983951568604, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0207, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.4383072257041931, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0275, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.42298370599746704, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0204, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.30508092045783997, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0195, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.6242369413375854, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0215, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.38399502635002136, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0201, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.4721924066543579, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0243, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.6958035230636597, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0201, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.3826717436313629, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0236, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.3098534941673279, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0216, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.43973061442375183, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0234, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.46570682525634766, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0226, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.46847036480903625, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0188, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.5139725804328918, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0195, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.48436662554740906, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0206, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.3445553481578827, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0241, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.8473356366157532, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0248, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.6241415143013, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0242, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.7302873730659485, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0224, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.29269692301750183, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0181, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.4065910577774048, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0253, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.36930134892463684, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0203, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.5521696209907532, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0208, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.3761119544506073, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0209, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.3330603241920471, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0233, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.27771884202957153, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0162, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.4225069284439087, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0177, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.33680275082588196, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0199, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.4399181604385376, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0236, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.49677175283432007, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0265, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.39700835943222046, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0193, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.4604041278362274, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0208, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.26002946496009827, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0197, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.3256632685661316, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0192, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3573099672794342, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0184, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.3116256892681122, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0197, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.39247608184814453, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0219, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.31291085481643677, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0194, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.5996116399765015, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0264, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.24854864180088043, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0207, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.5746667385101318, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0195, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.5744135975837708, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0182, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.5161272883415222, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0212, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.5889247059822083, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0172, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.53412926197052, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0209, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.3421672582626343, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0193, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.409906268119812, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0173, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.5139239430427551, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0198, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.5014253258705139, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0177, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.5942979454994202, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0206, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.218281552195549, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0204, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.43725427985191345, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0215, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.3467969000339508, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0168, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.2697127163410187, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0214, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.43687018752098083, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0262, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.47759339213371277, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0212, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.33211249113082886, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0228, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.29453045129776, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0233, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.34539318084716797, + "learning_rate": 5.608700869895367e-06, + "loss": 0.021, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.6664339900016785, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0203, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.21404555439949036, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0209, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.4320753812789917, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0236, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.415399968624115, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0235, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.2643829584121704, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0203, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.4354988932609558, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0172, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.43992263078689575, + "learning_rate": 5.554208267666996e-06, + "loss": 0.018, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.32208460569381714, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0183, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.27261701226234436, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0196, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.4348963499069214, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0173, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.40379852056503296, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0202, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.4592876136302948, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0219, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.4797484278678894, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0182, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.47892817854881287, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0185, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.46308979392051697, + "learning_rate": 5.492314644463202e-06, + "loss": 0.018, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.7745133638381958, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0207, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.6577957272529602, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0166, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.43036580085754395, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0218, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.41811347007751465, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0214, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.31980884075164795, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0198, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.3632652461528778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0209, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.467146635055542, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0173, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.5659807920455933, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0199, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.24540813267230988, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0178, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.3122001588344574, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0222, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2879388928413391, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0173, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.5185259580612183, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0168, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.239187091588974, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0198, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3844532370567322, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0179, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.3842040002346039, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0204, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.26496851444244385, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0172, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.40850451588630676, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0189, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21669425070285797, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0192, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.43664559721946716, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.021, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.49064821004867554, + "learning_rate": 5.339400468833427e-06, + "loss": 0.02, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.9060949683189392, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0204, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.3413904309272766, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0212, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.2620849311351776, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0201, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.3972470760345459, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0216, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.4422028064727783, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0177, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.2595955431461334, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0214, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.43522438406944275, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0226, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.33024686574935913, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0199, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.3532852232456207, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0194, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.3963644802570343, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0171, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.37003734707832336, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0174, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.27832016348838806, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0211, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.4203765392303467, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0196, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.31796127557754517, + "learning_rate": 5.233937303988081e-06, + "loss": 0.019, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.4561198949813843, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0198, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.4175209403038025, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0195, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.7017586827278137, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0201, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.4711352288722992, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.02, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.2737489640712738, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0198, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.44284430146217346, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0206, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.4556163251399994, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0208, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.3158712685108185, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0156, + "step": 22000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3741821653522842e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5fecc60b61aa66699566b01045633ce2fd4a6a74 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-22000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad96fcc5212b0fb64af2ed9b5a1ad33dee0cea6a86c08271b39c38f4388a38a +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b983150ade651abafd88a89da4fc3468ac30b730 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:703634c3a693ff94b3976aee09d7774fde850a5d5b5c478e3a25be5440cbfe42 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d5181bdf2d1e92fd09b697a1f069f4e3b23e27f4 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30263813e47923d808e717a89ab2ac924ff1d1735d9ded7d334b36019421f6f2 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..be618731337af945064e15a50cc2d2e4f17bcb63 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c747bea1a95c669ed9fd42b10868411180870bf30b0ddb47bce6fb4a3813fc68 +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9801b876d4902a6f04c8f4fc65c072e6082867 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -4.131592681121827, + -18.96289906921387, + -16.909606227111816, + -1.205507601451874, + -2.2364452423095704, + -1.8819086204528812, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 16.65274486618042, + 37.19429024200439, + 23.655689654541014, + 1.3209557065963748, + 2.6528479496955875, + 1.1486967510223387, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 2.868856906890869, + 6.296340465545654, + 1.3196077346801758, + 0.007151931058615446, + -0.012491658329963684, + -0.12626242637634277, + 0.12140887975692749, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 4.3321146965026855, + 12.4215087890625, + 7.703039169311523, + 0.391439288854599, + 0.8076039552688599, + 0.505150318145752, + 0.9926025867462158, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.763728466033935, + -21.229162658691408, + -2.350775989151001, + -4.0587354017257695, + -3.285622364997864, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.5495108631134, + 30.41332916412354, + 14.36571702880859, + 1.8286980584144592, + 2.2455153399467473, + 1.9114159921646117, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.469674587249756, + 1.137302041053772, + -3.50521183013916, + -0.009232619777321815, + -0.7088616490364075, + -0.43785586953163147, + 0.14176446199417114, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.948984146118164, + 16.641460418701172, + 8.162801742553711, + 0.6890953779220581, + 1.1180040836334229, + 0.9564125537872314, + 0.9899004101753235, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..487a8e434447e55774e050f446652748b5b50b24 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/trainer_state.json @@ -0,0 +1,16834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4380729822038467, + "eval_steps": 500, + "global_step": 24000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 5.55898904800415, + "learning_rate": 1.8e-07, + "loss": 0.7669, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.9875104427337646, + "learning_rate": 3.8e-07, + "loss": 0.7281, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 6.316451072692871, + "learning_rate": 5.800000000000001e-07, + "loss": 0.7134, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 4.037688255310059, + "learning_rate": 7.8e-07, + "loss": 0.6077, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 5.4920220375061035, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6779, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 3.809985876083374, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5578, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 5.501481533050537, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5453, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 2.584683418273926, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4145, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 2.854585886001587, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3617, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 3.2181553840637207, + "learning_rate": 1.98e-06, + "loss": 0.3402, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 1.6713179349899292, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2286, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 2.60302996635437, + "learning_rate": 2.38e-06, + "loss": 0.2477, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 1.7488818168640137, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1342, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 1.826812982559204, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.1243, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 1.1744091510772705, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1012, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 2.3573529720306396, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1108, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 2.1422371864318848, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1081, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.6756604313850403, + "learning_rate": 3.58e-06, + "loss": 0.0947, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 1.8197052478790283, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.103, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 2.135390281677246, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0791, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 1.185013771057129, + "learning_rate": 4.18e-06, + "loss": 0.0751, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 1.478454828262329, + "learning_rate": 4.38e-06, + "loss": 0.0685, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 1.1979939937591553, + "learning_rate": 4.58e-06, + "loss": 0.0642, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 1.3315266370773315, + "learning_rate": 4.78e-06, + "loss": 0.0706, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 1.219875454902649, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 1.9281997680664062, + "learning_rate": 5.18e-06, + "loss": 0.0781, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.5599610209465027, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0742, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.9128719568252563, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0638, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.5633432269096375, + "learning_rate": 5.78e-06, + "loss": 0.0633, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.7961149215698242, + "learning_rate": 5.98e-06, + "loss": 0.062, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 1.9408375024795532, + "learning_rate": 6.18e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 1.1925369501113892, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0654, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 1.0636825561523438, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0513, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.5671424865722656, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0561, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.8431388139724731, + "learning_rate": 6.98e-06, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 1.3813819885253906, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0619, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.7528055906295776, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0502, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 1.38446044921875, + "learning_rate": 7.58e-06, + "loss": 0.0623, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.9472984671592712, + "learning_rate": 7.78e-06, + "loss": 0.0471, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.640555739402771, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0539, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 1.4841065406799316, + "learning_rate": 8.18e-06, + "loss": 0.0684, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 1.0691452026367188, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.8026740550994873, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0579, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 1.3472259044647217, + "learning_rate": 8.78e-06, + "loss": 0.0725, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.8364902138710022, + "learning_rate": 8.98e-06, + "loss": 0.0613, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 1.0086181163787842, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0558, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 1.0559569597244263, + "learning_rate": 9.38e-06, + "loss": 0.0561, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.9138600826263428, + "learning_rate": 9.58e-06, + "loss": 0.0507, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.6099390387535095, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0543, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.890690803527832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.071, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.8349231481552124, + "learning_rate": 1.018e-05, + "loss": 0.0515, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 1.5466762781143188, + "learning_rate": 1.038e-05, + "loss": 0.0865, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 1.0859519243240356, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0511, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.7235454320907593, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.6314525008201599, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0494, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 1.5067164897918701, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0453, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.9329689145088196, + "learning_rate": 1.138e-05, + "loss": 0.0565, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 1.3631505966186523, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0513, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 1.2341063022613525, + "learning_rate": 1.178e-05, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.7126315832138062, + "learning_rate": 1.198e-05, + "loss": 0.0465, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.9995419383049011, + "learning_rate": 1.218e-05, + "loss": 0.0423, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.7614652514457703, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0466, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.7718682289123535, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0508, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.7280911803245544, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0481, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.6350377798080444, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0493, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.6868598461151123, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.057, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 1.132020354270935, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0464, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 1.097875952720642, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0465, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.8246905207633972, + "learning_rate": 1.378e-05, + "loss": 0.0488, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.5858931541442871, + "learning_rate": 1.398e-05, + "loss": 0.0533, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.418e-05, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.87618488073349, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0417, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.8312808871269226, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0627, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.5213949680328369, + "learning_rate": 1.478e-05, + "loss": 0.0526, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.7599508762359619, + "learning_rate": 1.498e-05, + "loss": 0.0487, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.9282987713813782, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0544, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 1.5959566831588745, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0594, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.6384497284889221, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.049, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.5377854108810425, + "learning_rate": 1.578e-05, + "loss": 0.0529, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.6186609864234924, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0485, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.9750168323516846, + "learning_rate": 1.618e-05, + "loss": 0.0458, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.6810588836669922, + "learning_rate": 1.638e-05, + "loss": 0.0521, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.8613447546958923, + "learning_rate": 1.658e-05, + "loss": 0.0464, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.8379164338111877, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0589, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.9312345385551453, + "learning_rate": 1.698e-05, + "loss": 0.0534, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.6983106732368469, + "learning_rate": 1.718e-05, + "loss": 0.0591, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.6549938321113586, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0571, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3887499272823334, + "learning_rate": 1.758e-05, + "loss": 0.0362, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 1.1392686367034912, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0602, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.834979772567749, + "learning_rate": 1.798e-05, + "loss": 0.0483, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.9094700813293457, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0536, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.9519254565238953, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0514, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.6514044404029846, + "learning_rate": 1.858e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.6005147099494934, + "learning_rate": 1.878e-05, + "loss": 0.0527, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 1.0990339517593384, + "learning_rate": 1.898e-05, + "loss": 0.0453, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.7029110193252563, + "learning_rate": 1.918e-05, + "loss": 0.0527, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.6106461882591248, + "learning_rate": 1.938e-05, + "loss": 0.043, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.48976996541023254, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0482, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 1.045139193534851, + "learning_rate": 1.978e-05, + "loss": 0.0449, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.7444337010383606, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0499, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.8378720879554749, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0606, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.5345956683158875, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.041, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.6428268551826477, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0648, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.9010246992111206, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0441, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.6655222177505493, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0532, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.5328973531723022, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0488, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 1.2394806146621704, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0525, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.9671902656555176, + "learning_rate": 1.999967041472886e-05, + "loss": 0.051, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.8754792213439941, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.054, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.524354875087738, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0682, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 1.0633796453475952, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0435, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.7348024249076843, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.923546552658081, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0501, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 1.0579051971435547, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.8214036822319031, + "learning_rate": 1.999882759038658e-05, + "loss": 0.057, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.7640904188156128, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0468, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5744732022285461, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0416, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.40397152304649353, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0389, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.6207796931266785, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0484, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 1.5230320692062378, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0586, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.8499330282211304, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0671, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.7697583436965942, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.061, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.6107252836227417, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.40468829870224, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0558, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.7711566686630249, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0487, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 1.0216137170791626, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0411, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 1.1135109663009644, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0428, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.545289158821106, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0426, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.9514102339744568, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0529, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.9448748826980591, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0468, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 1.1176340579986572, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.06, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.6428054571151733, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0398, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.8000763058662415, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0688, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.7624617218971252, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0524, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.7986068725585938, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0511, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 1.179044246673584, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0518, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.7511209845542908, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.041, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.8336644768714905, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.7198546528816223, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0472, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 1.404756784439087, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0479, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.861412525177002, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0448, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 1.2575286626815796, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0504, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.7020149230957031, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0416, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.9072129726409912, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0483, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.5503928661346436, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0498, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.5776561498641968, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.7854406237602234, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0431, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.7011817097663879, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0615, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.7760916352272034, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0525, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.9866206049919128, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0492, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.7466640472412109, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0564, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.8808642029762268, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0461, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.8980852365493774, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0613, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.6824257969856262, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0763, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.681532084941864, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0492, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.5667393207550049, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0471, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.5026432275772095, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0424, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.37448638677597046, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.037, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.6236661076545715, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.9748323559761047, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0326, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.7733910083770752, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0527, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.6466084718704224, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.6644402146339417, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0434, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 1.5936143398284912, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0495, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.5655786991119385, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0475, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.9557194709777832, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0518, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.8929481506347656, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0435, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.7515624761581421, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0404, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.7718303203582764, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0476, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.5583183765411377, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0495, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.7166038155555725, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0601, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.9311782717704773, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0507, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6159361600875854, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0319, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.816769003868103, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0505, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.9040331244468689, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.0498, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 1.696012020111084, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0689, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.5169436931610107, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0414, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 1.9156256914138794, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0558, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.6522107720375061, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0427, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.8480607867240906, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0425, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.6939795017242432, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0521, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.5763843059539795, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0486, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.6420201063156128, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0428, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.5305889248847961, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0371, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 1.3216971158981323, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0441, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.6441370844841003, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0444, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 1.4227683544158936, + "learning_rate": 1.996014938229576e-05, + "loss": 0.053, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.667000412940979, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0405, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.6865925192832947, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0532, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.8819414377212524, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0402, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.8738685250282288, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0494, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.8790421485900879, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0753, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.5451251268386841, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0385, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.46721863746643066, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0395, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.41896265745162964, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0461, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.7582527995109558, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0461, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.7154091596603394, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0464, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.788686215877533, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.46885132789611816, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0472, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.5174703598022461, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0501, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.8058022260665894, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.044, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.49327152967453003, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0404, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 1.532515048980713, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0548, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 1.1101130247116089, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0542, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.7396823763847351, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.042, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5801792740821838, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0589, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 1.4451886415481567, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0402, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.61793053150177, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0583, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.8073042631149292, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0492, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.9468027949333191, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0466, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.7384629249572754, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0589, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.4612124562263489, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.043, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.6821345090866089, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0373, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.6727206110954285, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0706, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.6935863494873047, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0376, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.9824007153511047, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0418, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.9782054424285889, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0453, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.7749345898628235, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0449, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 1.1558616161346436, + "learning_rate": 1.992544454099507e-05, + "loss": 0.051, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.33876606822013855, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0463, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.5539175271987915, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0389, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.554639995098114, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0375, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.46284249424934387, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0365, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.7209586501121521, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0465, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 1.0352572202682495, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0609, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.3893347680568695, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0449, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.3959295451641083, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.042, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.47758615016937256, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0608, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.7173318266868591, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0511, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.5889247059822083, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.5986958146095276, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.9506963491439819, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0513, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.8730902671813965, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0429, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.5152983069419861, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0347, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.786233127117157, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0464, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.7376151084899902, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0479, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.595055878162384, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0392, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.8207923769950867, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0441, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.7003177404403687, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.036, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.6637803316116333, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.5207458138465881, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0476, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 1.241939663887024, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0466, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.7212964296340942, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0459, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.6244897246360779, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.571205198764801, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0611, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.8839776515960693, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0464, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.580142080783844, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0434, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.6745111346244812, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0443, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.9726730585098267, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.48007458448410034, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0442, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.7205815315246582, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0461, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.5800597667694092, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0553, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.6497617959976196, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0398, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.7487000226974487, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.053, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.6686383485794067, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0494, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.6101617217063904, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0397, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.49039891362190247, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 1.076252818107605, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0472, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.7085466980934143, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0481, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.6343501210212708, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.7452435493469238, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0485, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.6645557880401611, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0455, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.5987662076950073, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0384, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 1.078682541847229, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0416, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.8880276083946228, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0427, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.8119439482688904, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0516, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.5018808245658875, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.035, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.623843252658844, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0468, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.48201584815979004, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0387, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.5672967433929443, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0374, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.7304291129112244, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0458, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 1.1493513584136963, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0495, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.8220258951187134, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0565, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 1.0740118026733398, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0484, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.6214267015457153, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0346, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6255515813827515, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 1.0625102519989014, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0511, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.8623147010803223, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.043, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.92961186170578, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0428, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.6050530076026917, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0405, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.944632351398468, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0434, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.4904105067253113, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0423, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.7352654337882996, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0425, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 1.0492011308670044, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0616, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.7823440432548523, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0447, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.8018720149993896, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0371, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.49853745102882385, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.036, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.8805229663848877, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0524, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.5573164820671082, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0387, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.7481330633163452, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0466, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.40816730260849, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0651, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.6791403889656067, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0393, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.7291558384895325, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0521, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.6312416791915894, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0489, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.7327824831008911, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0343, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 1.3112396001815796, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 1.2425460815429688, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0419, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.6839079856872559, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0491, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.7781338691711426, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0434, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.5329035520553589, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0468, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.7196246981620789, + "learning_rate": 1.978769450291435e-05, + "loss": 0.044, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.7625473737716675, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0441, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.5458085536956787, + "learning_rate": 1.978346349055984e-05, + "loss": 0.039, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.7765107154846191, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0467, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.7010345458984375, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.04, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.626748263835907, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0373, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.5149411559104919, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0461, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.9740221500396729, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.037, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.504397988319397, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.054, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.5483772158622742, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0365, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.29313552379608154, + "learning_rate": 1.976612732743278e-05, + "loss": 0.038, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.8453809022903442, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0413, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.5152369141578674, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0383, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.9969985485076904, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0465, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.9506912231445312, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0377, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.9154256582260132, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0428, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 1.2283018827438354, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0403, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.6880149841308594, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0395, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.4900283217430115, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0368, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.7604786157608032, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0447, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.559420108795166, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0456, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5867525339126587, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.4810929596424103, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0406, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.8294567465782166, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0405, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.8964418172836304, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0551, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5311513543128967, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.048, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.806564450263977, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0422, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.6752825975418091, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0436, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.5873673558235168, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.046, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.44951826333999634, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.6930672526359558, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0482, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5176821351051331, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0469, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.49050986766815186, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0505, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.7312544584274292, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0397, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.7582018375396729, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0472, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.5867499113082886, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0402, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.5435264706611633, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0357, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.7370457053184509, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.774713933467865, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0419, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 1.3614526987075806, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0443, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.6087996959686279, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0362, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.6685174703598022, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0437, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.9508783221244812, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0403, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.5553990006446838, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0454, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.5054144263267517, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0651, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.42293739318847656, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0431, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.7212286591529846, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0415, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.473127543926239, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.6872493028640747, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.031, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.5251455903053284, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0391, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.5380337834358215, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0409, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.7052116394042969, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0416, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.8229309916496277, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0372, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.9506240487098694, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0419, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.6417449116706848, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.6112877130508423, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0498, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 1.0621747970581055, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0478, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.7538444995880127, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0402, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.5625021457672119, + "learning_rate": 1.964833301001045e-05, + "loss": 0.048, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.47914358973503113, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0371, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.6854084134101868, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0478, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.9252145886421204, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0368, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.8439743518829346, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0417, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 1.0050065517425537, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0444, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.7451267242431641, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0444, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.8371824622154236, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0413, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 1.0461528301239014, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0343, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.39973369240760803, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0411, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.4291725754737854, + "learning_rate": 1.962083815106258e-05, + "loss": 0.035, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.7072318196296692, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.5897591710090637, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0422, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.724743664264679, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0412, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.6499989628791809, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0456, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.7375554442405701, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0481, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.5231707096099854, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0444, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.6235650777816772, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0352, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.43499720096588135, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0389, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.797736406326294, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0444, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 1.0550916194915771, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0504, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.6214169263839722, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0406, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.698083221912384, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0593, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.6379665732383728, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0493, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.5507146120071411, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0433, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.5956857204437256, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.44772031903266907, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0479, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.9360495209693909, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0434, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.5642439126968384, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0396, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.4046037495136261, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0408, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.5948778986930847, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0349, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.8199960589408875, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.035, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.4827987253665924, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0422, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.8324541449546814, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0396, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.4008340537548065, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0399, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.6216022372245789, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0456, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.37505266070365906, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0385, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.49176743626594543, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0394, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.5399725437164307, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0438, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.8310949802398682, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 1.1955338716506958, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0459, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 1.0068060159683228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0491, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.5460902452468872, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.7850955128669739, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.038, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.36727651953697205, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.042, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.5334084630012512, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0472, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.7271261215209961, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0382, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.5323888063430786, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0436, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.45585381984710693, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0374, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.7871994376182556, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0523, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.5605924129486084, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0394, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.6938880085945129, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0394, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.5804795026779175, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0437, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 1.0168874263763428, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0419, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.6860261559486389, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0381, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.7029629349708557, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.5081820487976074, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0359, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4721413254737854, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0445, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.36132606863975525, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0443, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.6331628561019897, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.5754039287567139, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0364, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 1.5680726766586304, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0568, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.49352893233299255, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0352, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.6292720437049866, + "learning_rate": 1.945830755977688e-05, + "loss": 0.056, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.7185224294662476, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.5580431222915649, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0395, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.7590157985687256, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0367, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.6500505208969116, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0373, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.408975213766098, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0458, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.5616204142570496, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.6361889243125916, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0371, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.8486977219581604, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0428, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.7492835521697998, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0444, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.7901867032051086, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0413, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.6845218539237976, + "learning_rate": 1.942106227801521e-05, + "loss": 0.041, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.9644033908843994, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0482, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.45466694235801697, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.37155815958976746, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0563, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.4936427175998688, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0466, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.6540364027023315, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0426, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.38369905948638916, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.5450782179832458, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0499, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.24151510000228882, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0431, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.8069043159484863, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0447, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.5423257946968079, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0496, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.4058588445186615, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0402, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.6126188635826111, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0458, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.7490487694740295, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0493, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.7295238971710205, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0462, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.7178632616996765, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0443, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.7040836215019226, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0414, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.6338651776313782, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0354, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 1.3360642194747925, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0503, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.46927154064178467, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.7340303659439087, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0381, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5492366552352905, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0328, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.7509336471557617, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0368, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.4471103847026825, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0405, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.6582043170928955, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0422, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.6933317184448242, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0347, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.450021892786026, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.5376274585723877, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0619, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.722744882106781, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0446, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.6075776219367981, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.047, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.6950559020042419, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0366, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.5763269066810608, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0416, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.5462995767593384, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.042, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.6304270029067993, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0388, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.6828057765960693, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0324, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.37152284383773804, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0451, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.4172256588935852, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0357, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.5640333294868469, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0522, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.8016167879104614, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0381, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.591262698173523, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0382, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.5212893486022949, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0478, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.7837402820587158, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0443, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.9257993698120117, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0468, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.5952717065811157, + "learning_rate": 1.926404507646751e-05, + "loss": 0.033, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.9675727486610413, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0451, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5518060326576233, + "learning_rate": 1.925630281527157e-05, + "loss": 0.039, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.9742224216461182, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0398, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.6197847723960876, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0466, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.47963154315948486, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0449, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.41337478160858154, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0441, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.7238340973854065, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0438, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.9248948097229004, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.059, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.6670559048652649, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0388, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.956350564956665, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0402, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.6378766894340515, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0377, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.9037134647369385, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.046, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.7720431685447693, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0519, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.7988153100013733, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0437, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.6672379970550537, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.8264118432998657, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0463, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.6753244400024414, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.048, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.5530163645744324, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0552, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 1.4215611219406128, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0537, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.8495141267776489, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0431, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.5609806180000305, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0355, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.30011680722236633, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0503, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.5155858993530273, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0402, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.48371294140815735, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0476, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.49065709114074707, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.4877799451351166, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0337, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.5917441248893738, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0379, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.42583322525024414, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.045, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.6343463659286499, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0449, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.8575727343559265, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0453, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.7644649147987366, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0396, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.6534778475761414, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0354, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.3632607161998749, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.035, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.9180629849433899, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.48914220929145813, + "learning_rate": 1.912298771234382e-05, + "loss": 0.043, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.8579902052879333, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0467, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 1.523177146911621, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 1.2650493383407593, + "learning_rate": 1.911035077753307e-05, + "loss": 0.046, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.8262631893157959, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0345, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.8710194826126099, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.8287770748138428, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.7243760824203491, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0445, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5953600406646729, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0409, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.5678296685218811, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0405, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.764759361743927, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0399, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.5969082713127136, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0345, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.5686851739883423, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0415, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.7042335867881775, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0343, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.46049684286117554, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0367, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.521037757396698, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0493, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.6116137504577637, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0341, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.6932541728019714, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.038, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.6795322299003601, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0555, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 1.5589205026626587, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0498, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.58689945936203, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0432, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.7746279239654541, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0455, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4707143008708954, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0365, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.6717873811721802, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0441, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 1.1001774072647095, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0387, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.6617273092269897, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.045, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 1.0732862949371338, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0461, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.43623387813568115, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0387, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.5842541456222534, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0401, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.8832051753997803, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0434, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.8454849123954773, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0364, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4587421119213104, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0342, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.5914700627326965, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0381, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.5075448751449585, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0614, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.6165316700935364, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0355, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.8761339783668518, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0382, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.8730667233467102, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0486, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.4631735384464264, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0479, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.7657212615013123, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0359, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.49685898423194885, + "learning_rate": 1.894749443411004e-05, + "loss": 0.037, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.8567603230476379, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0415, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.8778802156448364, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0427, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.7849876284599304, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.041, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.49304109811782837, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0406, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.6490961909294128, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0457, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 1.1704363822937012, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0489, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.6955687403678894, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0426, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.9385222792625427, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0397, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 1.0259507894515991, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0406, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 1.5581048727035522, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0377, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 1.1154224872589111, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0352, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.8913238048553467, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0372, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.32929253578186035, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0302, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.7686375379562378, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.7077587246894836, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0404, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.7370178699493408, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0379, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.8013477325439453, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0391, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.9743591547012329, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0466, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.6816489100456238, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0509, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.7814317345619202, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0449, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.6797910332679749, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.041, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.7159250378608704, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0408, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.7630175352096558, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0403, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.7929314374923706, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0468, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.5765302181243896, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0382, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.5043740272521973, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0447, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.7895818948745728, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0381, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.8037170767784119, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0434, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 1.0758732557296753, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0369, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.6673927307128906, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0475, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.6661775708198547, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0478, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.6422731280326843, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.6632615923881531, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0377, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.5715954899787903, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0306, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3375200629234314, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0385, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.42938506603240967, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0359, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.453436940908432, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0498, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.763883113861084, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.037, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.9350517392158508, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0524, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.6795313358306885, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0336, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4761887788772583, + "learning_rate": 1.875213208215953e-05, + "loss": 0.04, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.6547576189041138, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0359, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.7119831442832947, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0382, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.5195598602294922, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0577, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.44893282651901245, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.034, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.5159012079238892, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0374, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.6474353075027466, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0275, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.5070436000823975, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0382, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.28868627548217773, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0442, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.3915226459503174, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.6271824836730957, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0395, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 1.2117619514465332, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0409, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.4455721378326416, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0399, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0445, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.32646581530570984, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0435, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.4477322995662689, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0383, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.6562448740005493, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0317, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.25427868962287903, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0326, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.6234788298606873, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0328, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.4264411926269531, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0379, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.5537038445472717, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0383, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.5042442679405212, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0339, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.4152010679244995, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0324, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.6834092736244202, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0364, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.6276392340660095, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0336, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.687937319278717, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0415, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.48481765389442444, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0376, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 1.1335153579711914, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0421, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.6853719353675842, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.043, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.97500079870224, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0334, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.2953243553638458, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0334, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.6563237309455872, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0349, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.4983973205089569, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0441, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.42969775199890137, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0319, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.8316324353218079, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0359, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.4386466443538666, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0371, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.5664681792259216, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0359, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.5660601854324341, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.6432987451553345, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0447, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.6026568412780762, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0382, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.5358585119247437, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0366, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.3575671315193176, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0394, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.6645073890686035, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0391, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.6527594923973083, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0334, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.5664045810699463, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0426, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.3317505419254303, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0366, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.7218614220619202, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0399, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.6683867573738098, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0385, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.6589217185974121, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0445, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.39663317799568176, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0515, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.9468401074409485, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 1.0980640649795532, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0431, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 1.4567275047302246, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0467, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.3785778284072876, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0437, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.8112056255340576, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0406, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.8885411024093628, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0452, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.3356691002845764, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.033, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.7636258602142334, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.039, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.5050523281097412, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0331, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3761812150478363, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0346, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.560323178768158, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0417, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.5850566625595093, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0366, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.4377721846103668, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0315, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.5460193157196045, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0465, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.3818223476409912, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0313, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.566722571849823, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.037, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.970040500164032, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0354, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4968736171722412, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0376, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.5235893130302429, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0383, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.853208065032959, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0384, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.4627811312675476, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0615, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.4883791208267212, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0307, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.4702740013599396, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0539, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.5020611882209778, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0378, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.706611692905426, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0309, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.6137747764587402, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0364, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.45299193263053894, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0359, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.31410297751426697, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0425, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.48510870337486267, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.04, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.4697261154651642, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0401, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.8231471180915833, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0346, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.9511741995811462, + "learning_rate": 1.832162565208597e-05, + "loss": 0.038, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.4473752975463867, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0421, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.5309840440750122, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0375, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 1.1700010299682617, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0424, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.5007262229919434, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0389, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.8835527300834656, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.6059357523918152, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.37744027376174927, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0391, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.5641717910766602, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0383, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.4394749104976654, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0394, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.7094572186470032, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0384, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.6306723952293396, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0347, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.4480315148830414, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0415, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 1.014607310295105, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0426, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.7599517107009888, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0433, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 1.0942739248275757, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0378, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.47618037462234497, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0312, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6470023393630981, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0382, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.6031871438026428, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0336, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.7470970749855042, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0318, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.46166181564331055, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0361, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5585920214653015, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0443, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.5172198414802551, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0396, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.4908123314380646, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0294, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.5269665122032166, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0343, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.747257649898529, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0395, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.6794129610061646, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0471, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.4291394054889679, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0388, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.8051080107688904, + "learning_rate": 1.815952390818299e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.557299792766571, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0384, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.37832972407341003, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0333, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.30844688415527344, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.033, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.3014371395111084, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0344, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.778361439704895, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0351, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 1.14492666721344, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0462, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.35099321603775024, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0371, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.8470032215118408, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0339, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.641718327999115, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0363, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.6668172478675842, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0383, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.9396918416023254, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0401, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.5773718953132629, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0356, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.6474881172180176, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0487, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.5183063745498657, + "learning_rate": 1.807599344877606e-05, + "loss": 0.037, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.7699562311172485, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0487, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.6379490494728088, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0407, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4757876396179199, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0307, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.47382786870002747, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0367, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.6868136525154114, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0311, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.5475189089775085, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0293, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 1.013775110244751, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0383, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.46351560950279236, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0404, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.4883617162704468, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0408, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.6282979249954224, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0428, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 1.0833567380905151, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0394, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0405, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.7581565380096436, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0534, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.7900646328926086, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0432, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.6033529043197632, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0438, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.924926221370697, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0347, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.8485580682754517, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0523, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.3205278217792511, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0334, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.5392606854438782, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.03, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.6815987229347229, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0385, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.9605218768119812, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0359, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.5565723776817322, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0391, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.7528144717216492, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0431, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.5746167898178101, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0346, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.5058369636535645, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0346, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 1.1387027502059937, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0372, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.819324254989624, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0374, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.45600345730781555, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0344, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.7428935766220093, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0373, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.6960753202438354, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0387, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.6637990474700928, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0404, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.5612137317657471, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0375, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.6323001384735107, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0379, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.35169267654418945, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0371, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.38252803683280945, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0457, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.38694459199905396, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0345, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.37036198377609253, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0292, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.8060199618339539, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0398, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.44252580404281616, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0373, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.5565180778503418, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0345, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.4460795521736145, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0404, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.7309815883636475, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0364, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.6990997195243835, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0561, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.4198327660560608, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0401, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.04, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.48884230852127075, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0334, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.6440362930297852, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0451, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.9092825055122375, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0398, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.4839508533477783, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.8128801584243774, + "learning_rate": 1.776452218695584e-05, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.5291397571563721, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0394, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.6852243542671204, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0418, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.6294205188751221, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0374, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.5221384763717651, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0321, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.398296982049942, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0349, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.43008267879486084, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.6012991070747375, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0411, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.45076051354408264, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.037, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.6742259860038757, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0357, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.5989789962768555, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.037, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.4041040241718292, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0325, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.4937855899333954, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0354, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.5446217656135559, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0374, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.7479701638221741, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0415, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.7822495102882385, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0341, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.3672648072242737, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.035, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.5219965577125549, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0443, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.4092100262641907, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0331, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.5316944122314453, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0406, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 1.072263240814209, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0521, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.7448581457138062, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0362, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.44557711482048035, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0326, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.4298631250858307, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0365, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.45413365960121155, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0351, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.9562819004058838, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0394, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.9481335878372192, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0381, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.5020818114280701, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0402, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.6412234902381897, + "learning_rate": 1.758137056131309e-05, + "loss": 0.037, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.5511493682861328, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0535, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.5222594141960144, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0401, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.447127103805542, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0383, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.4780801832675934, + "learning_rate": 1.754802282200567e-05, + "loss": 0.041, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.2962804138660431, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0422, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.5125643014907837, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0337, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.4288216829299927, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0374, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4114690124988556, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0296, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.3511301577091217, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0315, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.8624657392501831, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0369, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.5518651008605957, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0364, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.5404661297798157, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0294, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.7494591474533081, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0315, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9748606085777283, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0429, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.8071768879890442, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0321, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.5210712552070618, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0355, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.6077958941459656, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0426, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.8688217997550964, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0366, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.7064969539642334, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0465, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0365, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.6350638270378113, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0419, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.42818939685821533, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0412, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.6915261745452881, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0327, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.9861057996749878, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.034, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.6910699009895325, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0463, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.6368144750595093, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0399, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 1.1909242868423462, + "learning_rate": 1.739216409306913e-05, + "loss": 0.042, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.6449970006942749, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0388, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.531061053276062, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0389, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.8275352716445923, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0503, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.8468548655509949, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0336, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.2949988842010498, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0342, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.30603477358818054, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 0.7177753448486328, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0381, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.4893733859062195, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0319, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.6618909239768982, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0317, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.5965152382850647, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0293, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.4357168674468994, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0478, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.9539002776145935, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0444, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.7171940207481384, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.037, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.5711817741394043, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.034, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.4134632647037506, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.39306095242500305, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0351, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.318985253572464, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0425, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.7674827575683594, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.041, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.7754977941513062, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0436, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.5827674269676208, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0371, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.3957774341106415, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0401, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.47415387630462646, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0344, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.6292631030082703, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.5913583636283875, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0385, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.465749055147171, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0402, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.7115443348884583, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0372, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.7476089596748352, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.042, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.5902891159057617, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0319, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.7117035984992981, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0312, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.7726907730102539, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0381, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.7318345308303833, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0464, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.8139578104019165, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0334, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.6128831505775452, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0338, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.478384405374527, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0361, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.36900776624679565, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0473, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 1.031351923942566, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0417, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.5248333215713501, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0402, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.6325647830963135, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.047, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.8417870402336121, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0406, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.617125391960144, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0385, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.4480224847793579, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0391, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 1.0203324556350708, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0379, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.6231842637062073, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0318, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37685611844062805, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0304, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 1.0700500011444092, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0362, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.4233555495738983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0341, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.7783017158508301, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0331, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.718287467956543, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0385, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.5477543473243713, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0308, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.5601311326026917, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0384, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.4944303631782532, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.5038384199142456, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0382, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.7288672924041748, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0319, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 1.0376721620559692, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0374, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.8827543258666992, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0351, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4307865798473358, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0321, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.5480561256408691, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0532, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.9598987102508545, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0365, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.4162677228450775, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0274, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.8729338049888611, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0437, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.7729384899139404, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0386, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.6997544169425964, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0303, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.49331608414649963, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0333, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.6684675812721252, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0329, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.5638986825942993, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.035, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.8375849723815918, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0431, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.5796175599098206, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0298, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.5302409529685974, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.032, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.43450990319252014, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0415, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.3897189795970917, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0372, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.8202592134475708, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0329, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.8023095726966858, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.3732883930206299, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0326, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.4916521906852722, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.031, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.46110638976097107, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.037, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.8587718605995178, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0351, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.7067242860794067, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.036, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.732545793056488, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.036, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.6573438048362732, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0392, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.6036579608917236, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0383, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.5556638836860657, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0396, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.7848073244094849, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0333, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.5758033394813538, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0315, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.5620765686035156, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0277, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.38210418820381165, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0437, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.6145310997962952, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0368, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.7370103001594543, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0349, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.942118763923645, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0399, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.5294848680496216, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0364, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.5716073513031006, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0313, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.4549729526042938, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0423, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.5841232538223267, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0369, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.3302208483219147, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.032, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.7107377648353577, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0382, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.6884296536445618, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0324, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.6279621720314026, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0314, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.882046103477478, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0408, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.8980706334114075, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0436, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.6433938145637512, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0395, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.6394492983818054, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.041, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.8700910806655884, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0333, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.6309515237808228, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0341, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.7955977916717529, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.8543604016304016, + "learning_rate": 1.663934987558109e-05, + "loss": 0.042, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.6915370225906372, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0347, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.6430726647377014, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0395, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.3080710768699646, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0299, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0407, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.7147136330604553, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0524, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.603560209274292, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.032, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.4913748502731323, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0419, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.532796323299408, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0463, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.7834717631340027, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0318, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.4865007698535919, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0329, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.5567988753318787, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0331, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.7487075328826904, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0408, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0294, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.7240496277809143, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0334, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.44733667373657227, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0378, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.7610008716583252, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0398, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 1.0738579034805298, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0461, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.5492804050445557, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0367, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.7817861437797546, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0392, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.6080313324928284, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0288, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.8218061923980713, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0335, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.6597305536270142, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0398, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.6254639625549316, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0339, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 1.0747283697128296, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0386, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.4679741859436035, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0409, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.7349653244018555, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0355, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.47712597250938416, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0524, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.8520345091819763, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0361, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.6470016837120056, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0296, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.8512763381004333, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0329, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.5876182913780212, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0381, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.47419166564941406, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0348, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.391215056180954, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0366, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.5373614430427551, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0373, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.23266319930553436, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0283, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.8146935105323792, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0377, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.5002696514129639, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0296, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.7518969774246216, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0394, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.44596755504608154, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0359, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.37095823884010315, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.031, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.48388785123825073, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0323, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.4681354761123657, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0573, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.9335370063781738, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0397, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.8231816291809082, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0307, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.7194622755050659, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0435, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.468923419713974, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.5806415677070618, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0422, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.6381694078445435, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0325, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.6025328636169434, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0321, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.7287771701812744, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0432, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.7109095454216003, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0315, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.4904409348964691, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0317, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.7382795214653015, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0296, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 1.2814927101135254, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.4594469368457794, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0297, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.5907943844795227, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.623093843460083, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0314, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.5146417021751404, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0362, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.5858095288276672, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0339, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.4178197383880615, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0445, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.37311851978302, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0321, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.6305625438690186, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0376, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.5927552580833435, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0339, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.4024806022644043, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.5766516327857971, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0325, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.4729812443256378, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0476, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.4650471806526184, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0387, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.6432391405105591, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.6335821151733398, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0307, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.5947774052619934, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0374, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.7248526811599731, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0286, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.5646173357963562, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0426, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.4240330457687378, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0261, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.6439619064331055, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0325, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.5899927020072937, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0328, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.6412765383720398, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.027, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.28143197298049927, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0285, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.2767931818962097, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0312, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.47175201773643494, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0318, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.4454171359539032, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0357, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.4573518931865692, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0319, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.5321150422096252, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0423, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.27531248331069946, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0284, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.663298487663269, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0328, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.9017484188079834, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0328, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0445, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.4777899980545044, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0348, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.5475958585739136, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0418, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.524467408657074, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0301, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.6302708387374878, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0334, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.41625329852104187, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0353, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2699313759803772, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0387, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.701999306678772, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.6053565144538879, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0343, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.864326000213623, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0371, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.7532107830047607, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0323, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.5603524446487427, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0357, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.5668624639511108, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0421, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.6352995038032532, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0381, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.7873902320861816, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0293, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.5853860378265381, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0336, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.525260329246521, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0404, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.4027518033981323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0334, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.9426722526550293, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0397, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.6003656983375549, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0408, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.643667459487915, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0507, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.6342907547950745, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0338, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.4388107657432556, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0393, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.3304736614227295, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0371, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.6479781866073608, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0357, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.5461524128913879, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0367, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.4362160563468933, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0302, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.5188114643096924, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0322, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.34805068373680115, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0355, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.5073755383491516, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0446, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.5647034645080566, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0386, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.5983169078826904, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.4163302481174469, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0278, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.5769792199134827, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.33103784918785095, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0272, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.6019038558006287, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0286, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.8199634552001953, + "learning_rate": 1.56658563993822e-05, + "loss": 0.041, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.7426667213439941, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0327, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.3630203306674957, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0316, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.7804543972015381, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0369, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.43314239382743835, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0362, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.5570499897003174, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0307, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.5796618461608887, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0312, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.7355082035064697, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0357, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.39807555079460144, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0281, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.7723329663276672, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0314, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.3936077058315277, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0344, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.6881195902824402, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0343, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.5343065857887268, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0336, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.6643530130386353, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.032, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.5642407536506653, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0326, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.6929567456245422, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0351, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.33013442158699036, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0362, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 1.056101679801941, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0443, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.5164589881896973, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0446, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.319035142660141, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0367, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.8530817627906799, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0321, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.7768056392669678, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0318, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.4015219211578369, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0263, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.6409371495246887, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0371, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.5829829573631287, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0424, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.8098331093788147, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0318, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.40581029653549194, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0345, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.5018268823623657, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0338, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.3689005970954895, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0304, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.4961407482624054, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0349, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.5551972389221191, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0389, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.5989762544631958, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0308, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.33431145548820496, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0291, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.5390793085098267, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0409, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.6348057389259338, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0299, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.9015149474143982, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0372, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.4148661494255066, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0351, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.48212167620658875, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0369, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.6210904121398926, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0387, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.4606397747993469, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0325, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.597671627998352, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0264, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.39612457156181335, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0291, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.514916718006134, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0327, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.3551333248615265, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0306, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.3721555173397064, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0343, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3669307231903076, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0339, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.5142899751663208, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0388, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.7722563147544861, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0319, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.5405625104904175, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.025, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.6617732048034668, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0361, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.8938334584236145, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0326, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.7913880944252014, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0325, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.6919751763343811, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0353, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.6518043279647827, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0292, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.8302627801895142, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0292, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.6278629302978516, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0314, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.42736759781837463, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0313, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 1.0469647645950317, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.038, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.4306422173976898, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.692587673664093, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.034, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.8272542953491211, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0332, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.700703501701355, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0435, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22474133968353271, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0348, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.47771376371383667, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0365, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.5043072700500488, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0336, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.4886966347694397, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0291, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.3845444321632385, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0418, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.6324570775032043, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0357, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.5614244937896729, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0351, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.4815816879272461, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0401, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.7729785442352295, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0357, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.589121401309967, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0319, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.5420895218849182, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0346, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.4504237771034241, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0279, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.26984909176826477, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.6075000762939453, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0319, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.6065084338188171, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0383, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.573225736618042, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0424, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.8821173906326294, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0409, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.4947790205478668, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0472, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.748337984085083, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0384, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.6375566124916077, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0373, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.6218035221099854, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0343, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.4296681880950928, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0317, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3609360158443451, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0348, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.49597665667533875, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.034, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.4339931309223175, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0351, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.44051092863082886, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0391, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.41610655188560486, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0345, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.6215106844902039, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0439, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.6418285965919495, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0289, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.6148926019668579, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0396, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.8690620064735413, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0371, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.4794996678829193, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.7622746229171753, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0396, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 1.0384955406188965, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0352, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.33424243330955505, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0272, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.5626234412193298, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0267, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.31714314222335815, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0297, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.8281066417694092, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0337, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.6054716110229492, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0336, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.5764144659042358, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0296, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.4696876108646393, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0318, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.5324695110321045, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0294, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.2989593744277954, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0275, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.6373855471611023, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0334, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.5332064032554626, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0333, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.4900652766227722, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.6812027096748352, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0321, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.6765509843826294, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0329, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.5016193389892578, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.034, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.5259473919868469, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0341, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.4551076292991638, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0289, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.5946309566497803, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0367, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.8045580387115479, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0292, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 1.089473843574524, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0433, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.7314861416816711, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0344, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.3244793713092804, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0329, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.9454575181007385, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.041, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.4321480393409729, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0338, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.7338399887084961, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0317, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.5811594724655151, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0299, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 1.1259782314300537, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0402, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.4460951089859009, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0279, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.4996945858001709, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0331, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.6428117156028748, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0339, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.7815113663673401, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0333, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.46364331245422363, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0321, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.6084109544754028, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0347, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.5775942206382751, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.4764224886894226, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0326, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.49608105421066284, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.033, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.40599140524864197, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0323, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.44920462369918823, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0348, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.393081396818161, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0329, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.5393109917640686, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0332, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.49641427397727966, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0341, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.4762181341648102, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0293, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.7498350143432617, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0338, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.5212231874465942, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0336, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3803718388080597, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0336, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.3723069429397583, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0313, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.6411343216896057, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0298, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.7487270832061768, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0334, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.4146348237991333, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0362, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.6354920864105225, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0345, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.8422425985336304, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.6452838182449341, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0317, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.6057304739952087, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0349, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.4880058467388153, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0283, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.6094764471054077, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0424, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.552979588508606, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0318, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.5134180188179016, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0267, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.3264164626598358, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0347, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.6406404972076416, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0326, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.4818336069583893, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0357, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.4660695791244507, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0348, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.527518093585968, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0293, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.5105645656585693, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0299, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.5807327628135681, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0348, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.34552720189094543, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0281, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.6902264952659607, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0345, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.7842390537261963, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0392, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.37371599674224854, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0307, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.4447094798088074, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0343, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.5179654359817505, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0328, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.34313148260116577, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0327, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.5038807988166809, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0398, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.5751231908798218, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0365, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.23205915093421936, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0338, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.3348182141780853, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0264, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.432725727558136, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0377, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.5504162907600403, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0334, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.7994229793548584, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0369, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.7374292016029358, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0305, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.786674976348877, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0283, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.39285191893577576, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.028, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.49710261821746826, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0285, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.2925172448158264, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0353, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.5930903553962708, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0265, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.5205737352371216, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0349, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.5042659044265747, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0376, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.6537132263183594, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0402, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.5453435182571411, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0344, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.7153663635253906, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0365, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.4821360409259796, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0359, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.389950156211853, + "learning_rate": 1.403120543105273e-05, + "loss": 0.031, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.6750137805938721, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0353, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.5380377173423767, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0329, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.45814576745033264, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0312, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.6910536289215088, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0349, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.49182868003845215, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0377, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.41329771280288696, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0383, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.47242429852485657, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0313, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.45115360617637634, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0294, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.44364428520202637, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0328, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.4205247461795807, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0282, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 1.0961225032806396, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0274, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.6065059304237366, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0327, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.3095875084400177, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0348, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.8527400493621826, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0285, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.4449825882911682, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0435, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 1.1708461046218872, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0312, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.6145966053009033, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0283, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.5100684762001038, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0331, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.37704023718833923, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0327, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.6774486899375916, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0347, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.4984931945800781, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0303, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.6189061403274536, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0316, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4665672183036804, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.038, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.898800790309906, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0292, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.5205129384994507, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0322, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.588542640209198, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0307, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.620620846748352, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.035, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.639234185218811, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0296, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.38672956824302673, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0355, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.5244165062904358, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0305, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.8960945010185242, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0323, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3789278566837311, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.031, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.5104514956474304, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0405, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.5860878825187683, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0376, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.9913963079452515, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0386, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.4112319350242615, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0276, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.703815221786499, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0303, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.7342479825019836, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0303, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.46025165915489197, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0324, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.3976695239543915, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0255, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.4137699604034424, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0298, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.6333696842193604, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0438, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.5179958343505859, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0268, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.5947912335395813, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0266, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.7916423678398132, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0363, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.7686305046081543, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0338, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.5727254152297974, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0275, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.8913756012916565, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0365, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.45855259895324707, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0401, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.8214496374130249, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0371, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.5001949667930603, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.033, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.6546716094017029, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0422, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.35789239406585693, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0323, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.7539666891098022, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0316, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.422543466091156, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0388, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.5595449805259705, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0351, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.3847978115081787, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0285, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.4276559352874756, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0292, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.5125867128372192, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0351, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.7208243012428284, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0293, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.5181360244750977, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0316, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.3499206304550171, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0281, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.26258599758148193, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.027, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.7002774477005005, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.031, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.5419202446937561, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0384, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.3112017512321472, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0234, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.6459445357322693, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0302, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.5128807425498962, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0385, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.41403454542160034, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0321, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.4647153615951538, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0358, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.29951611161231995, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0288, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.3440749943256378, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0274, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.413753867149353, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0276, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.29087361693382263, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.03, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.7001593708992004, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0277, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.47245970368385315, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0426, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.5747501850128174, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0337, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.42420580983161926, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0407, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.2931080162525177, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0344, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.8410253524780273, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0385, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.27601751685142517, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0304, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5673372745513916, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0261, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.5385505557060242, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0296, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.4159039556980133, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0343, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 1.0409079790115356, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0325, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.5017931461334229, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0311, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.45170727372169495, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0302, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.7260886430740356, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0353, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.7251535058021545, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0329, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.21863135695457458, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0354, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.5168152451515198, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0268, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.509765088558197, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0321, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.4227997958660126, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.031, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.5740527510643005, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0351, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.5497387647628784, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0277, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.3965212106704712, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.028, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.43198928236961365, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0421, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.42254316806793213, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0335, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.3395012617111206, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0309, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.6258816719055176, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0287, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.7914189100265503, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0263, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.4104739725589752, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0282, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.47704172134399414, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0358, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.7908433675765991, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0341, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.7039026021957397, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0369, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.4095489978790283, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.047, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.6500707864761353, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0285, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3794250190258026, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0293, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.3065261244773865, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.031, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.3773103654384613, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0303, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.602186918258667, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0398, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.5309048891067505, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0251, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.9474682211875916, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0345, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.7786683440208435, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0289, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.6320096850395203, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0326, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.7034086585044861, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0332, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.5060988664627075, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0337, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.7484520673751831, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0317, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.6556681394577026, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0349, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.41952699422836304, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0318, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.4678110182285309, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0328, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.35579657554626465, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0346, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.5984554290771484, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0277, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.41169118881225586, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0288, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.5163332223892212, + "learning_rate": 1.285944160290905e-05, + "loss": 0.027, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.780305802822113, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0249, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.4293205142021179, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0302, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.650065004825592, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0349, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.3155161142349243, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0333, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.5841111540794373, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0371, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.3873291015625, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0304, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.39657002687454224, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0279, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.6305680871009827, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0293, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.5810249447822571, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0317, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.6288999319076538, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0283, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.5402754545211792, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0258, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 1.3184820413589478, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0398, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.9564218521118164, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0301, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.8810652494430542, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0376, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.4254887104034424, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0336, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.45076319575309753, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0266, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.6057546138763428, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0292, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.4007343649864197, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0352, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.4183088541030884, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0265, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.368300199508667, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0326, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.4838104844093323, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0262, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.5136057138442993, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0299, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.5161435604095459, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0339, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.6350359320640564, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0361, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.5247905254364014, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0259, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.5668240785598755, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0324, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.48688119649887085, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0395, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.8496071100234985, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0326, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.7072296142578125, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0307, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.7262448072433472, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0376, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.5265096426010132, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0331, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.7246168851852417, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0286, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.4539868235588074, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.036, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.36881664395332336, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0302, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.37113773822784424, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0278, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.537762463092804, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0325, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.6519997715950012, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0309, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.31448549032211304, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0245, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.43815988302230835, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0398, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.525791585445404, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0261, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.4887944757938385, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.025, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.5287007689476013, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0278, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.7277513146400452, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0304, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.6415050029754639, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0292, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.48691895604133606, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0337, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.53068608045578, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0338, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.5464624762535095, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0303, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.3911614418029785, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0345, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.6894099116325378, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0365, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.5268317461013794, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0405, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.8635499477386475, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0321, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21542859077453613, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0264, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.6257337331771851, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0355, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.6525475978851318, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0304, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.4599299430847168, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0314, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.7497361898422241, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.031, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.3124896287918091, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0257, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.6170748472213745, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0323, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.4619428515434265, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0315, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.5088011026382446, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0255, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.5397948622703552, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0265, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.457082062959671, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0279, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.4131294786930084, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0269, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 1.1949660778045654, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.6057063341140747, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0306, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.26918280124664307, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0283, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.48841091990470886, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0323, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.6195886135101318, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0295, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.5798623561859131, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.031, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.4877539277076721, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0267, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.33261221647262573, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0261, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.8361077904701233, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0311, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.305922269821167, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0302, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.22662357985973358, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.028, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.4273515045642853, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0307, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.521216869354248, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0277, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.7090896368026733, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0346, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.3693661391735077, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0305, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.3651321530342102, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0263, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.5577923655509949, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0357, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.6504148840904236, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0404, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.49205282330513, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.035, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.6053458452224731, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0328, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.5949649214744568, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0302, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.5310356020927429, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0264, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4087911546230316, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0273, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.35929426550865173, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0274, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.5112904906272888, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0253, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.39148232340812683, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0305, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.47718697786331177, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0304, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.620936393737793, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0289, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.8953443169593811, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0328, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.4663226902484894, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0302, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.707167387008667, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0319, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.5325813889503479, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0318, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.6239158511161804, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0289, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.38823947310447693, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0266, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.48849165439605713, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0234, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.23214028775691986, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0276, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3467197120189667, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0282, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.2009357064962387, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0298, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.8589951395988464, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0264, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.43969056010246277, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0292, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.5750611424446106, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0289, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.5399556756019592, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0307, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.20517395436763763, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0249, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.7490189671516418, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0246, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.6661257743835449, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0325, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.571394681930542, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0342, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.8792482018470764, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0332, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.5770248770713806, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0286, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.62962406873703, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0246, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.4651380479335785, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.037, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.5087499022483826, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0265, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.44421979784965515, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0306, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.6521517038345337, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0334, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.5384942889213562, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0296, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.41909387707710266, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0297, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.6697047352790833, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0331, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.4015032947063446, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0326, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.48070228099823, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0278, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.8651071786880493, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0242, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 1.17703378200531, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0288, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.45865103602409363, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0322, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.41243845224380493, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0297, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.482997864484787, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0305, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.5319142937660217, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0284, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.6116752028465271, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0311, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.4214901328086853, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0269, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.6246733069419861, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.026, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.4263368248939514, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0305, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.4059041738510132, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.022, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.6362516283988953, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0265, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.2905973494052887, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0297, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.42270833253860474, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0255, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.26410749554634094, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0252, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.7570974230766296, + "learning_rate": 1.153689339251154e-05, + "loss": 0.027, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.5941224098205566, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0295, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.3985750079154968, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0337, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.3877560496330261, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.024, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.44742006063461304, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0284, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.3280893564224243, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0318, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.5289477109909058, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0341, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.4976208806037903, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0239, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.6153465509414673, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0252, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.6112402677536011, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0292, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.4973732531070709, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0307, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.5871816277503967, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0254, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 1.2150986194610596, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.033, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.6406526565551758, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0265, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.4251798093318939, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0269, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.4702431857585907, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0311, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.3235304355621338, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0236, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.4913889467716217, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0231, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.4980977177619934, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0289, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.740922212600708, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0334, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.3305300772190094, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0301, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.7037357091903687, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0311, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.44783756136894226, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0339, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.7776843309402466, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0349, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.49181437492370605, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0285, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.333814799785614, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0284, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 1.203652262687683, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0365, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.521643877029419, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0313, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.33309581875801086, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0265, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.48567256331443787, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0357, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8473871946334839, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0355, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.43827518820762634, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0266, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.5849157571792603, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0317, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.5690399408340454, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0266, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.6484784483909607, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0294, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.8894811272621155, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0239, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4575272798538208, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0323, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.4288756847381592, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.032, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.8871303200721741, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0243, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.5861580967903137, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0335, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.4159319996833801, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0247, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.6948496699333191, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0299, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.5089551210403442, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0333, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.6912631392478943, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0303, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.6213784217834473, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0295, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.4634060561656952, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0261, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.5664045214653015, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0262, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.7963227033615112, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0278, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.45378491282463074, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0268, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.8970746994018555, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0271, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.5109472274780273, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0307, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.5023297667503357, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0263, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.6055631041526794, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0285, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.38602766394615173, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0282, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.5447302460670471, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0319, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.6613780856132507, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0271, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 1.0358555316925049, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.026, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.4463629722595215, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0271, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.5373798608779907, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.025, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.7735916972160339, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0325, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.5017692446708679, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0262, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.3406142592430115, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0271, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.28971537947654724, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0238, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.45441415905952454, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0261, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.4653581976890564, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.026, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.5449947714805603, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0314, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.41015395522117615, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0272, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.5936392545700073, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0269, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.5043690800666809, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0256, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.6176534295082092, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0285, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.6774734258651733, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0268, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.7045454978942871, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0305, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.5905448794364929, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0284, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.7881343364715576, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0321, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.6635507941246033, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0284, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.46298888325691223, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0394, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.5187172889709473, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0257, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.5974661707878113, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0305, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.5171123743057251, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0275, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.35988888144493103, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0295, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.30543047189712524, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0334, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.6582810878753662, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0309, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.4986134171485901, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0294, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.5560855269432068, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0224, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.28974607586860657, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0313, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.24015791714191437, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.026, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.2704199552536011, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0244, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.6661707162857056, + "learning_rate": 1.068904422762975e-05, + "loss": 0.027, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.5058556795120239, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0254, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.7086800336837769, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0242, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.6752822399139404, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0262, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.8279762268066406, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0312, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.5070614814758301, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0308, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.3933897614479065, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0287, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.37238794565200806, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0325, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.7591347098350525, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0265, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.4841652810573578, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0331, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.45236295461654663, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0412, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.4774094820022583, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0289, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.47564345598220825, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0294, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.341337651014328, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0281, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.341701865196228, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0224, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.6621959209442139, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0283, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.348466694355011, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0234, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.35208311676979065, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0248, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.4973156154155731, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0246, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.3668982982635498, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0228, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.4771873950958252, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0303, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.3595021665096283, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0265, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.6013099551200867, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0297, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.40996676683425903, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0321, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.45742037892341614, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0288, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.8092222213745117, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0278, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.32741186022758484, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0288, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.5716732740402222, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0256, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.3263239562511444, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0271, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.35390567779541016, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0266, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.36520150303840637, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0265, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.46227532625198364, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0305, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.40079647302627563, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0327, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.3689155578613281, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0249, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.49527907371520996, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.029, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.38931334018707275, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0233, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.5698918700218201, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0269, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 1.0959579944610596, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.029, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.6321646571159363, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0276, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.7166606783866882, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0292, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.6464444994926453, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0246, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.7318128347396851, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0296, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.4828032851219177, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0247, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.4509548842906952, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0241, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.413630872964859, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0313, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.42443349957466125, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0316, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.8199112415313721, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0389, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.28918105363845825, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0242, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.6759344339370728, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0308, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.5480250120162964, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.025, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.48897549510002136, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.027, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.6111220121383667, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0276, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.8852546215057373, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0251, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.5098162889480591, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.022, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.45974940061569214, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0206, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3925095200538635, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0251, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.5461363792419434, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0217, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.5685333609580994, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0231, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.494150310754776, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0243, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.8770614862442017, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0286, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.27142134308815, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0253, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.3365682363510132, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0241, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.5512370467185974, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0242, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.5581703782081604, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0276, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.306773841381073, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0262, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.44620928168296814, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0229, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.5870804786682129, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0228, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.26162099838256836, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0278, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.27250319719314575, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0293, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.8330137729644775, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0315, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.5206989645957947, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0282, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.5408382415771484, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0359, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.30517199635505676, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0267, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.5315027236938477, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0206, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.46061626076698303, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0222, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.47393080592155457, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0262, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.3686772882938385, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0254, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.3312757611274719, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0243, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.565447986125946, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0267, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.5690101385116577, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0237, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.44088438153266907, + "learning_rate": 9.911670744652783e-06, + "loss": 0.028, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.3708919882774353, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0265, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.589698851108551, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0297, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.6541375517845154, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0288, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.5304558873176575, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0243, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.5774737000465393, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0277, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.5616280436515808, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0267, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.6129759550094604, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0223, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.45278221368789673, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0304, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.44487202167510986, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0296, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.5391712188720703, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0256, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.43523359298706055, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0277, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.5308435559272766, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0242, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.3361283540725708, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0236, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.3764631450176239, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0304, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.9003425240516663, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0278, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.2787775993347168, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0219, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.40089285373687744, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0284, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.3619711101055145, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0252, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.7354542016983032, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0242, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.3854006826877594, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0302, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.3318389058113098, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0265, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.5286651849746704, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0235, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.24921932816505432, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0259, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.7376067042350769, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0238, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.35099226236343384, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0257, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.3805389702320099, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0198, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.4433703124523163, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0241, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.3667793571949005, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0268, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.2963331639766693, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0223, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.9817414879798889, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0248, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.6529688835144043, + "learning_rate": 9.612315882780393e-06, + "loss": 0.032, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.7663154602050781, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0267, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.6086964011192322, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0281, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.5240464806556702, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0339, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.6558368802070618, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0284, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.6192268133163452, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0309, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.5293763875961304, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0257, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.38831329345703125, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0239, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.12827467918396, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0323, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.411818265914917, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0274, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.5521355867385864, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0233, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.26673075556755066, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0317, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.5205486416816711, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0273, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.8010990619659424, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0292, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.420612633228302, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0274, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.4811270236968994, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0277, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.4959382712841034, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0288, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.4607725739479065, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0245, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.9101414680480957, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0283, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.38626620173454285, + "learning_rate": 9.42959233811777e-06, + "loss": 0.026, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.5709372758865356, + "learning_rate": 9.419993062475743e-06, + "loss": 0.021, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.4417913854122162, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0291, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.5651213526725769, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0228, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.4716165363788605, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0242, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.9120892286300659, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0296, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.5004292130470276, + "learning_rate": 9.372024722887089e-06, + "loss": 0.033, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.3422714173793793, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0284, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.5391610264778137, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0362, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.5446203351020813, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0247, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.5441875457763672, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0284, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.48274070024490356, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0245, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.6035326719284058, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0226, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.3104001581668854, + "learning_rate": 9.304949604077693e-06, + "loss": 0.029, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.27859869599342346, + "learning_rate": 9.295375311262483e-06, + "loss": 0.022, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.3896406292915344, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0235, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.4526473581790924, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0289, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.6624506115913391, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0265, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.6976125836372375, + "learning_rate": 9.257098257046206e-06, + "loss": 0.029, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.5974310040473938, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0205, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.7627739906311035, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0333, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.3166525065898895, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0309, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.41519322991371155, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0223, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.31840237975120544, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0239, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.47412827610969543, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0228, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.41170552372932434, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0209, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.45858854055404663, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0243, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.7870534658432007, + "learning_rate": 9.171095634265995e-06, + "loss": 0.027, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.4080354869365692, + "learning_rate": 9.161550369445782e-06, + "loss": 0.023, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.47916823625564575, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0303, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.6911760568618774, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0263, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.3980148732662201, + "learning_rate": 9.132927564918328e-06, + "loss": 0.028, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.47085851430892944, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0266, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.5085862874984741, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0239, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.5219245553016663, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0267, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.5199264287948608, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0277, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.6157195568084717, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0343, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.5366696715354919, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0271, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.3640076220035553, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0258, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.5320505499839783, + "learning_rate": 9.05669731553499e-06, + "loss": 0.024, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.507826566696167, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0253, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.741392195224762, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0242, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.5325136184692383, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0224, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.4709665775299072, + "learning_rate": 9.018636566864313e-06, + "loss": 0.026, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.4371986985206604, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0264, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.47594818472862244, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0224, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.488423228263855, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0261, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.24745763838291168, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0206, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.5042629837989807, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0305, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.5255836844444275, + "learning_rate": 8.961615424107555e-06, + "loss": 0.026, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.4605107307434082, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0274, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.3252561390399933, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0227, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.35779184103012085, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0296, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.2960403263568878, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0212, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.6344659328460693, + "learning_rate": 8.914163487132906e-06, + "loss": 0.026, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.4614463150501251, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0234, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4490053951740265, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0265, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.5291271209716797, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0326, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.5311887264251709, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0257, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.5647584199905396, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0295, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.3913862705230713, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0256, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.4476219415664673, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0248, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.7807655930519104, + "learning_rate": 8.83836825410936e-06, + "loss": 0.026, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.38984328508377075, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0247, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.5757346153259277, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0296, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.25636178255081177, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0222, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.45617344975471497, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0224, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.3066493272781372, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0237, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.26513972878456116, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0277, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.445230633020401, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0248, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.4914413392543793, + "learning_rate": 8.762735374981932e-06, + "loss": 0.022, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.41469570994377136, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0245, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.33235347270965576, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0229, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.4890037775039673, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0247, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.41330578923225403, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0285, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.6309427618980408, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0233, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.42090296745300293, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0254, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.5888519287109375, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0262, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.5488774180412292, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0262, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.48015111684799194, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0219, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.4484168291091919, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0276, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.4128018319606781, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0218, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.5151517987251282, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0242, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.6248350143432617, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0267, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.4116908013820648, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0242, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.6138579249382019, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0282, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.22843605279922485, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0284, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.49555841088294983, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0244, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.5752411484718323, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0275, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.5129706859588623, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0237, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.751230001449585, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0257, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.47749435901641846, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0277, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21702095866203308, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0255, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.30658838152885437, + "learning_rate": 8.54624657467318e-06, + "loss": 0.024, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.3589625954627991, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0215, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.5434426069259644, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0224, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.8732438683509827, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0289, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.34988290071487427, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0226, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.4021032154560089, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0248, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.4676196873188019, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0235, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.41646474599838257, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0235, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.5892519950866699, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0221, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.5757095217704773, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0258, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.4664652645587921, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0275, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.4674879014492035, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0285, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.7277936339378357, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0316, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.40373867750167847, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0213, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.8632686138153076, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0239, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.5620945692062378, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0259, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3430384695529938, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0287, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.46981969475746155, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0218, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.3494231700897217, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0238, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.514975368976593, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0205, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.6442168951034546, + "learning_rate": 8.359228888944986e-06, + "loss": 0.021, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.32178881764411926, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0219, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.48865941166877747, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0261, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.6131434440612793, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0269, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.4471806585788727, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0251, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.8255780935287476, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0229, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.843673586845398, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0278, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.4278610348701477, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0228, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.5036011338233948, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0291, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.5141382813453674, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0217, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.8976346850395203, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0248, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.5634751319885254, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0276, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.5327013731002808, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0279, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.2723959982395172, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0225, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.4455258846282959, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0222, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.3784103989601135, + "learning_rate": 8.219774325200873e-06, + "loss": 0.024, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.8102694749832153, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0231, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.5179240703582764, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0255, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.39830490946769714, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0264, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.32860279083251953, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0241, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.5459582209587097, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0193, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.3841477036476135, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0282, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.7849119305610657, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0319, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.4457703232765198, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0279, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.30464428663253784, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0184, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 1.0635287761688232, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0265, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.33294421434402466, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0235, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.5644985437393188, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0218, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.4975566565990448, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0261, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.7503839135169983, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0218, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.35363277792930603, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0198, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.43968406319618225, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0253, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.4553394615650177, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0266, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.45489153265953064, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0264, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.424696147441864, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0209, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.4819740653038025, + "learning_rate": 8.03498318084394e-06, + "loss": 0.022, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.564834475517273, + "learning_rate": 8.025779439806006e-06, + "loss": 0.024, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.7905157804489136, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0261, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.6985124349594116, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0315, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.42378291487693787, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0237, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.5980759263038635, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0217, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.45916232466697693, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0235, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.25486481189727783, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0231, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.4072360694408417, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0261, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3813820481300354, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0209, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.3040210008621216, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0225, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.30910906195640564, + "learning_rate": 7.933935782312965e-06, + "loss": 0.026, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.6573566794395447, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0262, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.30632153153419495, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0251, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.3277539610862732, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0233, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.49589917063713074, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0211, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.4149130880832672, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0203, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.7051926851272583, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0272, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.8553881049156189, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0236, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.5676615238189697, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0242, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.29548707604408264, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0236, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.36076608300209045, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0219, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.3657922148704529, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0227, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.27593615651130676, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0251, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.35554730892181396, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0259, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.45652297139167786, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0274, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.5757999420166016, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0222, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.5138059854507446, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0216, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.338874876499176, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0232, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.48215195536613464, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0226, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.30239933729171753, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0205, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.6099343299865723, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0219, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.6730902791023254, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0239, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.4575020968914032, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0204, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.2673267424106598, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0222, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.3593531548976898, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0225, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.5385488867759705, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0248, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.3900541663169861, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0277, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.6182276010513306, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0241, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.4897976815700531, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0229, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.5717247128486633, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0273, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.4837515950202942, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0219, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.31954509019851685, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0271, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.23005163669586182, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0204, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.500217616558075, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0229, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.47326523065567017, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0203, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.5074726939201355, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0249, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.6583673357963562, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0243, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.7585731744766235, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0264, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.3782348036766052, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0216, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.43963512778282166, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0201, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.6450467109680176, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0254, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.3420482575893402, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0224, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.3532888889312744, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0216, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.32494598627090454, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0196, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.2898419499397278, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0234, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.4379838705062866, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0233, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.5390518307685852, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0169, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.3786150813102722, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0203, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.3376149833202362, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0266, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.40810349583625793, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0241, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.24485738575458527, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0199, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.4670563340187073, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0184, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.374255508184433, + "learning_rate": 7.4623904967312e-06, + "loss": 0.018, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.4191536605358124, + "learning_rate": 7.453427567620127e-06, + "loss": 0.022, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3807078003883362, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0232, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.7537381649017334, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0202, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.36507129669189453, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0236, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.24461498856544495, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0221, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.351654589176178, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0236, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.35627686977386475, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0213, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.4586603343486786, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0304, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.4082098603248596, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0237, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.47707459330558777, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0247, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.4687316119670868, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0344, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.4660017788410187, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0214, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.4644101560115814, + "learning_rate": 7.346200065486093e-06, + "loss": 0.022, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.3139079213142395, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0234, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.36445188522338867, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0262, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.6457782983779907, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0261, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.4184044599533081, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0245, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.44356703758239746, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0215, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.5394402742385864, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0302, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 0.5960429906845093, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0234, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2850514352321625, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0243, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.45071718096733093, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0233, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.3157344162464142, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0254, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.45518410205841064, + "learning_rate": 7.248450164740439e-06, + "loss": 0.024, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.2323702722787857, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0226, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.6025039553642273, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0246, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.4983830749988556, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0199, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.3684524595737457, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0252, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.36924007534980774, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0277, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.3531496822834015, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0228, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.3995579779148102, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0193, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.4124946892261505, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0221, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.3897329866886139, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0221, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.45230787992477417, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0238, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.45878538489341736, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0244, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.4302407503128052, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0237, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.30422642827033997, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0173, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.49566513299942017, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0201, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.43262094259262085, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0227, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.8250450491905212, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0259, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.3265332281589508, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0205, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.2871774435043335, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0201, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.4341558814048767, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0199, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.43365293741226196, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0201, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.5876246690750122, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0205, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.2719171643257141, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0211, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.38791123032569885, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0244, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.4082484543323517, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0206, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.5010205507278442, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0245, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.4404369294643402, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0268, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.5171347856521606, + "learning_rate": 7.010805483338283e-06, + "loss": 0.024, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.5137951970100403, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0241, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.563709557056427, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0193, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.44687238335609436, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0207, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.33815798163414, + "learning_rate": 6.975884226362e-06, + "loss": 0.0246, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.33789384365081787, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0206, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.38053908944129944, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0195, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.5730066299438477, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0199, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.42453598976135254, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0218, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.48010921478271484, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0328, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.5227254629135132, + "learning_rate": 6.923644220932124e-06, + "loss": 0.019, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4078599810600281, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0212, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.4473094046115875, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0281, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.3459968864917755, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0231, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.4205886721611023, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0256, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.5397320985794067, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0214, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.6208626627922058, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0224, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.34377506375312805, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0197, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.4086950123310089, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0202, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.5211176872253418, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0201, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3705415725708008, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0219, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.32692769169807434, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0204, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.42599135637283325, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0213, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.565449595451355, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0223, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.4027825593948364, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0233, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.4833034574985504, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0309, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.5570312738418579, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0213, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.30241742730140686, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0197, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.37468239665031433, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0214, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.5555301904678345, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0223, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.6084730625152588, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0261, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.5931955575942993, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0237, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.30350545048713684, + "learning_rate": 6.733587654719298e-06, + "loss": 0.02, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.6784055233001709, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0281, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.5559973120689392, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0204, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.7529487013816833, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0235, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.7032052874565125, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0176, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.5018401741981506, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0197, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.5020368695259094, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0231, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.3605690598487854, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0254, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.3482762575149536, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0223, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.4260469675064087, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0199, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.23622000217437744, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0239, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.3683573007583618, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0223, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.32972025871276855, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0228, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.4159783124923706, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0221, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.24288412928581238, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0188, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.42375463247299194, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0183, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.26672226190567017, + "learning_rate": 6.596880604028027e-06, + "loss": 0.02, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.30816635489463806, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0219, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.315452516078949, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0218, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.5412175059318542, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0233, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.4290241003036499, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0233, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.3977762460708618, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0239, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.4023628532886505, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0197, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.8707197308540344, + "learning_rate": 6.53748481975927e-06, + "loss": 0.029, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.37878328561782837, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0218, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.685556173324585, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0248, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.5783588886260986, + "learning_rate": 6.512107839793337e-06, + "loss": 0.02, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.5456825494766235, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0279, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.6162738800048828, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0259, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.38887348771095276, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0198, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.5207514762878418, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0201, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.671120822429657, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0259, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.28870952129364014, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0175, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.3909374177455902, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0214, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.3419650197029114, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0217, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.563515305519104, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0185, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.6295453310012817, + "learning_rate": 6.427861749601945e-06, + "loss": 0.023, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.4404713213443756, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0188, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.698448121547699, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0225, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.5679222941398621, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0213, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.5237470269203186, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0261, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.4205586016178131, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0232, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.36608314514160156, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.02, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.49511757493019104, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0247, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.3475521206855774, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0202, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.36345914006233215, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0197, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.34304162859916687, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0183, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.41459065675735474, + "learning_rate": 6.335811156758245e-06, + "loss": 0.02, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.34139952063560486, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0211, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.29463231563568115, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0225, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.37984198331832886, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0201, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.21912901103496552, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0226, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.34660178422927856, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0179, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.6080809235572815, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0187, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.43388310074806213, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0226, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.53389972448349, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0237, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.39731428027153015, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0176, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.32715681195259094, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0211, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.36709150671958923, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0194, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.5554866790771484, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0202, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.26253199577331543, + "learning_rate": 6.227878992893104e-06, + "loss": 0.02, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.3686104714870453, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0191, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.36151114106178284, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0213, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.5019435882568359, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0203, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 1.1914043426513672, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0249, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.45042529702186584, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0244, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.3239169120788574, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0219, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.3253174424171448, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0226, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.6497724652290344, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0238, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.5800855159759521, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0211, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.29717954993247986, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0198, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.35056066513061523, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0219, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.28448906540870667, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0227, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.33300310373306274, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0165, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.5134487748146057, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0219, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.45153549313545227, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0191, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.6483689546585083, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0211, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.5660327076911926, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0207, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.6027820706367493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0201, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.6102983951568604, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0207, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.4383072257041931, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0275, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.42298370599746704, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0204, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.30508092045783997, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0195, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.6242369413375854, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0215, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.38399502635002136, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0201, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.4721924066543579, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0243, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.6958035230636597, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0201, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.3826717436313629, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0236, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.3098534941673279, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0216, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.43973061442375183, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0234, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.46570682525634766, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0226, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.46847036480903625, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0188, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.5139725804328918, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0195, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.48436662554740906, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0206, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.3445553481578827, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0241, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.8473356366157532, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0248, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.6241415143013, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0242, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.7302873730659485, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0224, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.29269692301750183, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0181, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.4065910577774048, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0253, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.36930134892463684, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0203, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.5521696209907532, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0208, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.3761119544506073, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0209, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.3330603241920471, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0233, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.27771884202957153, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0162, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.4225069284439087, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0177, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.33680275082588196, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0199, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.4399181604385376, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0236, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.49677175283432007, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0265, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.39700835943222046, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0193, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.4604041278362274, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0208, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.26002946496009827, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0197, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.3256632685661316, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0192, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3573099672794342, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0184, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.3116256892681122, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0197, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.39247608184814453, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0219, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.31291085481643677, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0194, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.5996116399765015, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0264, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.24854864180088043, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0207, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.5746667385101318, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0195, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.5744135975837708, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0182, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.5161272883415222, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0212, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.5889247059822083, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0172, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.53412926197052, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0209, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.3421672582626343, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0193, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.409906268119812, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0173, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.5139239430427551, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0198, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.5014253258705139, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0177, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.5942979454994202, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0206, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.218281552195549, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0204, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.43725427985191345, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0215, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.3467969000339508, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0168, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.2697127163410187, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0214, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.43687018752098083, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0262, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.47759339213371277, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0212, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.33211249113082886, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0228, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.29453045129776, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0233, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.34539318084716797, + "learning_rate": 5.608700869895367e-06, + "loss": 0.021, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.6664339900016785, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0203, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.21404555439949036, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0209, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.4320753812789917, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0236, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.415399968624115, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0235, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.2643829584121704, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0203, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.4354988932609558, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0172, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.43992263078689575, + "learning_rate": 5.554208267666996e-06, + "loss": 0.018, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.32208460569381714, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0183, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.27261701226234436, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0196, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.4348963499069214, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0173, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.40379852056503296, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0202, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.4592876136302948, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0219, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.4797484278678894, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0182, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.47892817854881287, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0185, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.46308979392051697, + "learning_rate": 5.492314644463202e-06, + "loss": 0.018, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.7745133638381958, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0207, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.6577957272529602, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0166, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.43036580085754395, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0218, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.41811347007751465, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0214, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.31980884075164795, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0198, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.3632652461528778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0209, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.467146635055542, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0173, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.5659807920455933, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0199, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.24540813267230988, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0178, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.3122001588344574, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0222, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2879388928413391, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0173, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.5185259580612183, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0168, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.239187091588974, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0198, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3844532370567322, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0179, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.3842040002346039, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0204, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.26496851444244385, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0172, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.40850451588630676, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0189, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21669425070285797, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0192, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.43664559721946716, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.021, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.49064821004867554, + "learning_rate": 5.339400468833427e-06, + "loss": 0.02, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.9060949683189392, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0204, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.3413904309272766, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0212, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.2620849311351776, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0201, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.3972470760345459, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0216, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.4422028064727783, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0177, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.2595955431461334, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0214, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.43522438406944275, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0226, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.33024686574935913, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0199, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.3532852232456207, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0194, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.3963644802570343, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0171, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.37003734707832336, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0174, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.27832016348838806, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0211, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.4203765392303467, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0196, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.31796127557754517, + "learning_rate": 5.233937303988081e-06, + "loss": 0.019, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.4561198949813843, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0198, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.4175209403038025, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0195, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.7017586827278137, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0201, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.4711352288722992, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.02, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.2737489640712738, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0198, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.44284430146217346, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0206, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.4556163251399994, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0208, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.3158712685108185, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0156, + "step": 22000 + }, + { + "epoch": 1.3188327640961113, + "grad_norm": 0.4620053172111511, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0187, + "step": 22010 + }, + { + "epoch": 1.3194319611720295, + "grad_norm": 0.7892107963562012, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0195, + "step": 22020 + }, + { + "epoch": 1.3200311582479478, + "grad_norm": 0.37334534525871277, + "learning_rate": 5.152002600477859e-06, + "loss": 0.02, + "step": 22030 + }, + { + "epoch": 1.320630355323866, + "grad_norm": 0.4440039098262787, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0244, + "step": 22040 + }, + { + "epoch": 1.3212295523997843, + "grad_norm": 0.2650533616542816, + "learning_rate": 5.137194259935739e-06, + "loss": 0.017, + "step": 22050 + }, + { + "epoch": 1.3218287494757026, + "grad_norm": 0.5425522327423096, + "learning_rate": 5.129800405815733e-06, + "loss": 0.019, + "step": 22060 + }, + { + "epoch": 1.3224279465516209, + "grad_norm": 0.5764152407646179, + "learning_rate": 5.122413440701921e-06, + "loss": 0.018, + "step": 22070 + }, + { + "epoch": 1.3230271436275391, + "grad_norm": 0.3985585868358612, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0214, + "step": 22080 + }, + { + "epoch": 1.3236263407034574, + "grad_norm": 0.513511598110199, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0189, + "step": 22090 + }, + { + "epoch": 1.3242255377793757, + "grad_norm": 0.3784070909023285, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0164, + "step": 22100 + }, + { + "epoch": 1.324824734855294, + "grad_norm": 0.7029585242271423, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0201, + "step": 22110 + }, + { + "epoch": 1.3254239319312122, + "grad_norm": 0.28351524472236633, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0207, + "step": 22120 + }, + { + "epoch": 1.3260231290071305, + "grad_norm": 0.5500089526176453, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0222, + "step": 22130 + }, + { + "epoch": 1.3266223260830488, + "grad_norm": 0.35926392674446106, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0195, + "step": 22140 + }, + { + "epoch": 1.327221523158967, + "grad_norm": 0.24845866858959198, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0198, + "step": 22150 + }, + { + "epoch": 1.3278207202348853, + "grad_norm": 0.3264683485031128, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0178, + "step": 22160 + }, + { + "epoch": 1.3284199173108036, + "grad_norm": 0.47955816984176636, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0206, + "step": 22170 + }, + { + "epoch": 1.3290191143867218, + "grad_norm": 0.31802570819854736, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0168, + "step": 22180 + }, + { + "epoch": 1.32961831146264, + "grad_norm": 0.40685755014419556, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0223, + "step": 22190 + }, + { + "epoch": 1.3302175085385584, + "grad_norm": 0.4924621284008026, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0195, + "step": 22200 + }, + { + "epoch": 1.3308167056144766, + "grad_norm": 0.640724241733551, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0183, + "step": 22210 + }, + { + "epoch": 1.331415902690395, + "grad_norm": 0.6712080836296082, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0196, + "step": 22220 + }, + { + "epoch": 1.3320150997663132, + "grad_norm": 0.34785783290863037, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0174, + "step": 22230 + }, + { + "epoch": 1.3326142968422314, + "grad_norm": 0.46851038932800293, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0186, + "step": 22240 + }, + { + "epoch": 1.3332134939181497, + "grad_norm": 0.6138949394226074, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0197, + "step": 22250 + }, + { + "epoch": 1.333812690994068, + "grad_norm": 0.3083338439464569, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0179, + "step": 22260 + }, + { + "epoch": 1.3344118880699862, + "grad_norm": 0.3143295347690582, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0217, + "step": 22270 + }, + { + "epoch": 1.3350110851459045, + "grad_norm": 0.3330692946910858, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0149, + "step": 22280 + }, + { + "epoch": 1.3356102822218228, + "grad_norm": 0.2732333242893219, + "learning_rate": 4.961660586405147e-06, + "loss": 0.017, + "step": 22290 + }, + { + "epoch": 1.336209479297741, + "grad_norm": 0.3350054621696472, + "learning_rate": 4.954434444590436e-06, + "loss": 0.022, + "step": 22300 + }, + { + "epoch": 1.3368086763736593, + "grad_norm": 0.2735322415828705, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0181, + "step": 22310 + }, + { + "epoch": 1.3374078734495776, + "grad_norm": 0.5919206738471985, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0201, + "step": 22320 + }, + { + "epoch": 1.3380070705254958, + "grad_norm": 0.28201058506965637, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0188, + "step": 22330 + }, + { + "epoch": 1.338606267601414, + "grad_norm": 0.505592942237854, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0188, + "step": 22340 + }, + { + "epoch": 1.3392054646773324, + "grad_norm": 0.5231548547744751, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0184, + "step": 22350 + }, + { + "epoch": 1.3398046617532506, + "grad_norm": 0.3743092715740204, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0176, + "step": 22360 + }, + { + "epoch": 1.340403858829169, + "grad_norm": 0.5908241271972656, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0224, + "step": 22370 + }, + { + "epoch": 1.3410030559050872, + "grad_norm": 0.4231952428817749, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0177, + "step": 22380 + }, + { + "epoch": 1.3416022529810054, + "grad_norm": 0.5666583180427551, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0218, + "step": 22390 + }, + { + "epoch": 1.3422014500569237, + "grad_norm": 0.4740161597728729, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0179, + "step": 22400 + }, + { + "epoch": 1.342800647132842, + "grad_norm": 0.3947773873806, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.02, + "step": 22410 + }, + { + "epoch": 1.3433998442087602, + "grad_norm": 0.3114109933376312, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0223, + "step": 22420 + }, + { + "epoch": 1.3439990412846785, + "grad_norm": 0.44969403743743896, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0169, + "step": 22430 + }, + { + "epoch": 1.3445982383605968, + "grad_norm": 0.29602059721946716, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0168, + "step": 22440 + }, + { + "epoch": 1.345197435436515, + "grad_norm": 0.3884619474411011, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0205, + "step": 22450 + }, + { + "epoch": 1.3457966325124333, + "grad_norm": 0.2929127514362335, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0149, + "step": 22460 + }, + { + "epoch": 1.3463958295883516, + "grad_norm": 0.4955149292945862, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0213, + "step": 22470 + }, + { + "epoch": 1.3469950266642698, + "grad_norm": 0.4021163582801819, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0192, + "step": 22480 + }, + { + "epoch": 1.3475942237401881, + "grad_norm": 0.2945493757724762, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.02, + "step": 22490 + }, + { + "epoch": 1.3481934208161064, + "grad_norm": 0.34085726737976074, + "learning_rate": 4.81141273556404e-06, + "loss": 0.0286, + "step": 22500 + }, + { + "epoch": 1.3487926178920246, + "grad_norm": 0.32751014828681946, + "learning_rate": 4.804337352679613e-06, + "loss": 0.0226, + "step": 22510 + }, + { + "epoch": 1.349391814967943, + "grad_norm": 0.3844929337501526, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0155, + "step": 22520 + }, + { + "epoch": 1.3499910120438612, + "grad_norm": 0.5286590456962585, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0229, + "step": 22530 + }, + { + "epoch": 1.3505902091197795, + "grad_norm": 0.26664429903030396, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0151, + "step": 22540 + }, + { + "epoch": 1.3511894061956977, + "grad_norm": 0.528367280960083, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0239, + "step": 22550 + }, + { + "epoch": 1.351788603271616, + "grad_norm": 0.5871155858039856, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0196, + "step": 22560 + }, + { + "epoch": 1.3523878003475343, + "grad_norm": 0.5686034560203552, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0184, + "step": 22570 + }, + { + "epoch": 1.3529869974234525, + "grad_norm": 0.40526366233825684, + "learning_rate": 4.755013723146175e-06, + "loss": 0.018, + "step": 22580 + }, + { + "epoch": 1.3535861944993708, + "grad_norm": 0.37055784463882446, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0184, + "step": 22590 + }, + { + "epoch": 1.354185391575289, + "grad_norm": 0.5210561156272888, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0165, + "step": 22600 + }, + { + "epoch": 1.3547845886512073, + "grad_norm": 0.3386324942111969, + "learning_rate": 4.733984792194363e-06, + "loss": 0.018, + "step": 22610 + }, + { + "epoch": 1.3553837857271256, + "grad_norm": 0.40071168541908264, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0198, + "step": 22620 + }, + { + "epoch": 1.3559829828030439, + "grad_norm": 0.3415983319282532, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0168, + "step": 22630 + }, + { + "epoch": 1.3565821798789621, + "grad_norm": 0.3700709939002991, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0166, + "step": 22640 + }, + { + "epoch": 1.3571813769548804, + "grad_norm": 0.3559338450431824, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0174, + "step": 22650 + }, + { + "epoch": 1.3577805740307987, + "grad_norm": 0.5588265657424927, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0207, + "step": 22660 + }, + { + "epoch": 1.358379771106717, + "grad_norm": 0.4539838433265686, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0164, + "step": 22670 + }, + { + "epoch": 1.3589789681826352, + "grad_norm": 0.34879690408706665, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0165, + "step": 22680 + }, + { + "epoch": 1.3595781652585535, + "grad_norm": 0.22862373292446136, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0158, + "step": 22690 + }, + { + "epoch": 1.3601773623344717, + "grad_norm": 0.5536275506019592, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0137, + "step": 22700 + }, + { + "epoch": 1.36077655941039, + "grad_norm": 0.5599532127380371, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0206, + "step": 22710 + }, + { + "epoch": 1.3613757564863083, + "grad_norm": 0.2961312532424927, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0138, + "step": 22720 + }, + { + "epoch": 1.3619749535622265, + "grad_norm": 0.5834526419639587, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0174, + "step": 22730 + }, + { + "epoch": 1.362574150638145, + "grad_norm": 0.5941792726516724, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0205, + "step": 22740 + }, + { + "epoch": 1.363173347714063, + "grad_norm": 0.2580801844596863, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0199, + "step": 22750 + }, + { + "epoch": 1.3637725447899816, + "grad_norm": 0.3897567689418793, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0168, + "step": 22760 + }, + { + "epoch": 1.3643717418658996, + "grad_norm": 0.37937042117118835, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0213, + "step": 22770 + }, + { + "epoch": 1.364970938941818, + "grad_norm": 0.3964179456233978, + "learning_rate": 4.616077433849538e-06, + "loss": 0.019, + "step": 22780 + }, + { + "epoch": 1.3655701360177361, + "grad_norm": 0.3632303476333618, + "learning_rate": 4.609208744970524e-06, + "loss": 0.015, + "step": 22790 + }, + { + "epoch": 1.3661693330936546, + "grad_norm": 0.5750122666358948, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0168, + "step": 22800 + }, + { + "epoch": 1.3667685301695727, + "grad_norm": 0.36310067772865295, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0172, + "step": 22810 + }, + { + "epoch": 1.3673677272454912, + "grad_norm": 0.5438339114189148, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0198, + "step": 22820 + }, + { + "epoch": 1.3679669243214092, + "grad_norm": 0.37394630908966064, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0202, + "step": 22830 + }, + { + "epoch": 1.3685661213973277, + "grad_norm": 0.2454962432384491, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0188, + "step": 22840 + }, + { + "epoch": 1.3691653184732457, + "grad_norm": 0.474844366312027, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0223, + "step": 22850 + }, + { + "epoch": 1.3697645155491642, + "grad_norm": 0.30256277322769165, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0188, + "step": 22860 + }, + { + "epoch": 1.3703637126250823, + "grad_norm": 0.500045657157898, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0179, + "step": 22870 + }, + { + "epoch": 1.3709629097010008, + "grad_norm": 0.609107494354248, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0182, + "step": 22880 + }, + { + "epoch": 1.3715621067769188, + "grad_norm": 0.20867787301540375, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0168, + "step": 22890 + }, + { + "epoch": 1.3721613038528373, + "grad_norm": 0.41653770208358765, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0172, + "step": 22900 + }, + { + "epoch": 1.3727605009287553, + "grad_norm": 0.357435941696167, + "learning_rate": 4.527371771040039e-06, + "loss": 0.017, + "step": 22910 + }, + { + "epoch": 1.3733596980046738, + "grad_norm": 0.5994096994400024, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0177, + "step": 22920 + }, + { + "epoch": 1.3739588950805919, + "grad_norm": 0.3150171935558319, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0164, + "step": 22930 + }, + { + "epoch": 1.3745580921565104, + "grad_norm": 0.4483601748943329, + "learning_rate": 4.507082898761475e-06, + "loss": 0.019, + "step": 22940 + }, + { + "epoch": 1.3751572892324284, + "grad_norm": 0.529812753200531, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0175, + "step": 22950 + }, + { + "epoch": 1.375756486308347, + "grad_norm": 0.26758334040641785, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0187, + "step": 22960 + }, + { + "epoch": 1.376355683384265, + "grad_norm": 0.3228643834590912, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0195, + "step": 22970 + }, + { + "epoch": 1.3769548804601834, + "grad_norm": 0.3437839150428772, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0207, + "step": 22980 + }, + { + "epoch": 1.3775540775361017, + "grad_norm": 0.28592896461486816, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0185, + "step": 22990 + }, + { + "epoch": 1.37815327461202, + "grad_norm": 0.5544041991233826, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0191, + "step": 23000 + }, + { + "epoch": 1.3787524716879382, + "grad_norm": 1.0831762552261353, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0237, + "step": 23010 + }, + { + "epoch": 1.3793516687638565, + "grad_norm": 0.3546636700630188, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0203, + "step": 23020 + }, + { + "epoch": 1.3799508658397748, + "grad_norm": 0.32998642325401306, + "learning_rate": 4.446628604336844e-06, + "loss": 0.018, + "step": 23030 + }, + { + "epoch": 1.380550062915693, + "grad_norm": 0.40987834334373474, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0189, + "step": 23040 + }, + { + "epoch": 1.3811492599916113, + "grad_norm": 0.6094655990600586, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0174, + "step": 23050 + }, + { + "epoch": 1.3817484570675296, + "grad_norm": 0.631481409072876, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0179, + "step": 23060 + }, + { + "epoch": 1.3823476541434478, + "grad_norm": 0.4069002866744995, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0192, + "step": 23070 + }, + { + "epoch": 1.3829468512193661, + "grad_norm": 0.36600202322006226, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0196, + "step": 23080 + }, + { + "epoch": 1.3835460482952844, + "grad_norm": 0.3092246353626251, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0185, + "step": 23090 + }, + { + "epoch": 1.3841452453712026, + "grad_norm": 0.2811580300331116, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0162, + "step": 23100 + }, + { + "epoch": 1.384744442447121, + "grad_norm": 0.4177345037460327, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0196, + "step": 23110 + }, + { + "epoch": 1.3853436395230392, + "grad_norm": 0.40211164951324463, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0199, + "step": 23120 + }, + { + "epoch": 1.3859428365989575, + "grad_norm": 0.31014713644981384, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0173, + "step": 23130 + }, + { + "epoch": 1.3865420336748757, + "grad_norm": 0.5378808379173279, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0189, + "step": 23140 + }, + { + "epoch": 1.387141230750794, + "grad_norm": 0.3483606278896332, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0201, + "step": 23150 + }, + { + "epoch": 1.3877404278267123, + "grad_norm": 0.5112893581390381, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0209, + "step": 23160 + }, + { + "epoch": 1.3883396249026305, + "grad_norm": 0.26471400260925293, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.013, + "step": 23170 + }, + { + "epoch": 1.3889388219785488, + "grad_norm": 0.6770564317703247, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0174, + "step": 23180 + }, + { + "epoch": 1.389538019054467, + "grad_norm": 0.4251134693622589, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0169, + "step": 23190 + }, + { + "epoch": 1.3901372161303853, + "grad_norm": 0.2985415458679199, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0212, + "step": 23200 + }, + { + "epoch": 1.3907364132063036, + "grad_norm": 0.4635870158672333, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0164, + "step": 23210 + }, + { + "epoch": 1.3913356102822219, + "grad_norm": 0.4360525906085968, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0174, + "step": 23220 + }, + { + "epoch": 1.3919348073581401, + "grad_norm": 0.6121042370796204, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0176, + "step": 23230 + }, + { + "epoch": 1.3925340044340584, + "grad_norm": 0.3049333095550537, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0162, + "step": 23240 + }, + { + "epoch": 1.3931332015099767, + "grad_norm": 0.46471482515335083, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0193, + "step": 23250 + }, + { + "epoch": 1.393732398585895, + "grad_norm": 0.27093327045440674, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0204, + "step": 23260 + }, + { + "epoch": 1.3943315956618132, + "grad_norm": 0.3513331711292267, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0209, + "step": 23270 + }, + { + "epoch": 1.3949307927377315, + "grad_norm": 0.3452320396900177, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0172, + "step": 23280 + }, + { + "epoch": 1.3955299898136497, + "grad_norm": 0.44609951972961426, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0198, + "step": 23290 + }, + { + "epoch": 1.396129186889568, + "grad_norm": 0.27217286825180054, + "learning_rate": 4.269026084410863e-06, + "loss": 0.016, + "step": 23300 + }, + { + "epoch": 1.3967283839654863, + "grad_norm": 0.5857428908348083, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0206, + "step": 23310 + }, + { + "epoch": 1.3973275810414045, + "grad_norm": 0.3834620714187622, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0165, + "step": 23320 + }, + { + "epoch": 1.3979267781173228, + "grad_norm": 0.34176892042160034, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0156, + "step": 23330 + }, + { + "epoch": 1.398525975193241, + "grad_norm": 0.2497260719537735, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0183, + "step": 23340 + }, + { + "epoch": 1.3991251722691593, + "grad_norm": 0.3003418743610382, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0188, + "step": 23350 + }, + { + "epoch": 1.3997243693450776, + "grad_norm": 0.19922316074371338, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0162, + "step": 23360 + }, + { + "epoch": 1.4003235664209959, + "grad_norm": 0.5160003900527954, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0181, + "step": 23370 + }, + { + "epoch": 1.4009227634969141, + "grad_norm": 0.4917953312397003, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0197, + "step": 23380 + }, + { + "epoch": 1.4015219605728324, + "grad_norm": 0.2868032455444336, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0192, + "step": 23390 + }, + { + "epoch": 1.4021211576487507, + "grad_norm": 0.30980560183525085, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0178, + "step": 23400 + }, + { + "epoch": 1.402720354724669, + "grad_norm": 0.31523144245147705, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0193, + "step": 23410 + }, + { + "epoch": 1.4033195518005872, + "grad_norm": 0.23731909692287445, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0171, + "step": 23420 + }, + { + "epoch": 1.4039187488765055, + "grad_norm": 0.4911767542362213, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0171, + "step": 23430 + }, + { + "epoch": 1.4045179459524237, + "grad_norm": 0.3095512390136719, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0165, + "step": 23440 + }, + { + "epoch": 1.405117143028342, + "grad_norm": 0.6421821117401123, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0178, + "step": 23450 + }, + { + "epoch": 1.4057163401042603, + "grad_norm": 0.4887765645980835, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0212, + "step": 23460 + }, + { + "epoch": 1.4063155371801785, + "grad_norm": 0.4543951451778412, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0165, + "step": 23470 + }, + { + "epoch": 1.4069147342560968, + "grad_norm": 0.4595223367214203, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0144, + "step": 23480 + }, + { + "epoch": 1.407513931332015, + "grad_norm": 0.6325511336326599, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0203, + "step": 23490 + }, + { + "epoch": 1.4081131284079333, + "grad_norm": 0.6220779418945312, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0225, + "step": 23500 + }, + { + "epoch": 1.4087123254838516, + "grad_norm": 0.3728989362716675, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0202, + "step": 23510 + }, + { + "epoch": 1.4093115225597699, + "grad_norm": 0.4958861470222473, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0204, + "step": 23520 + }, + { + "epoch": 1.4099107196356881, + "grad_norm": 0.32445529103279114, + "learning_rate": 4.122270968037107e-06, + "loss": 0.016, + "step": 23530 + }, + { + "epoch": 1.4105099167116064, + "grad_norm": 0.3969140350818634, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0174, + "step": 23540 + }, + { + "epoch": 1.4111091137875247, + "grad_norm": 0.39698946475982666, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0163, + "step": 23550 + }, + { + "epoch": 1.411708310863443, + "grad_norm": 0.4633882939815521, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0179, + "step": 23560 + }, + { + "epoch": 1.4123075079393612, + "grad_norm": 0.36993899941444397, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0216, + "step": 23570 + }, + { + "epoch": 1.4129067050152795, + "grad_norm": 0.4137882590293884, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0187, + "step": 23580 + }, + { + "epoch": 1.4135059020911978, + "grad_norm": 0.320867121219635, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0238, + "step": 23590 + }, + { + "epoch": 1.414105099167116, + "grad_norm": 0.3139745593070984, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0175, + "step": 23600 + }, + { + "epoch": 1.4147042962430343, + "grad_norm": 0.572628378868103, + "learning_rate": 4.072221948222934e-06, + "loss": 0.018, + "step": 23610 + }, + { + "epoch": 1.4153034933189526, + "grad_norm": 0.575975239276886, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0189, + "step": 23620 + }, + { + "epoch": 1.4159026903948708, + "grad_norm": 0.26301854848861694, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0121, + "step": 23630 + }, + { + "epoch": 1.416501887470789, + "grad_norm": 0.3042408525943756, + "learning_rate": 4.053587511509546e-06, + "loss": 0.0185, + "step": 23640 + }, + { + "epoch": 1.4171010845467074, + "grad_norm": 0.2503415644168854, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0208, + "step": 23650 + }, + { + "epoch": 1.4177002816226256, + "grad_norm": 0.3556166887283325, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0202, + "step": 23660 + }, + { + "epoch": 1.418299478698544, + "grad_norm": 0.652975857257843, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0194, + "step": 23670 + }, + { + "epoch": 1.4188986757744622, + "grad_norm": 0.4215060770511627, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0166, + "step": 23680 + }, + { + "epoch": 1.4194978728503804, + "grad_norm": 0.2277296483516693, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0172, + "step": 23690 + }, + { + "epoch": 1.4200970699262987, + "grad_norm": 0.3370293378829956, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0201, + "step": 23700 + }, + { + "epoch": 1.420696267002217, + "grad_norm": 0.4235946834087372, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0189, + "step": 23710 + }, + { + "epoch": 1.4212954640781352, + "grad_norm": 1.0387974977493286, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0176, + "step": 23720 + }, + { + "epoch": 1.4218946611540535, + "grad_norm": 0.7258256077766418, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0204, + "step": 23730 + }, + { + "epoch": 1.4224938582299718, + "grad_norm": 0.35412806272506714, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0165, + "step": 23740 + }, + { + "epoch": 1.42309305530589, + "grad_norm": 0.5192556977272034, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0166, + "step": 23750 + }, + { + "epoch": 1.4236922523818083, + "grad_norm": 0.3292843699455261, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0163, + "step": 23760 + }, + { + "epoch": 1.4242914494577266, + "grad_norm": 0.46782153844833374, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0174, + "step": 23770 + }, + { + "epoch": 1.4248906465336448, + "grad_norm": 0.6324945092201233, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0183, + "step": 23780 + }, + { + "epoch": 1.4254898436095633, + "grad_norm": 0.4347882568836212, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0138, + "step": 23790 + }, + { + "epoch": 1.4260890406854814, + "grad_norm": 0.3393082320690155, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0155, + "step": 23800 + }, + { + "epoch": 1.4266882377613999, + "grad_norm": 0.28411221504211426, + "learning_rate": 3.949383948670156e-06, + "loss": 0.016, + "step": 23810 + }, + { + "epoch": 1.427287434837318, + "grad_norm": 0.45982369780540466, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0134, + "step": 23820 + }, + { + "epoch": 1.4278866319132364, + "grad_norm": 0.32810381054878235, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0163, + "step": 23830 + }, + { + "epoch": 1.4284858289891544, + "grad_norm": 0.5996097922325134, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0246, + "step": 23840 + }, + { + "epoch": 1.429085026065073, + "grad_norm": 0.40002167224884033, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0158, + "step": 23850 + }, + { + "epoch": 1.429684223140991, + "grad_norm": 0.4102090299129486, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0179, + "step": 23860 + }, + { + "epoch": 1.4302834202169095, + "grad_norm": 0.44915929436683655, + "learning_rate": 3.913175335139808e-06, + "loss": 0.019, + "step": 23870 + }, + { + "epoch": 1.4308826172928275, + "grad_norm": 0.251206636428833, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0183, + "step": 23880 + }, + { + "epoch": 1.431481814368746, + "grad_norm": 0.2564012408256531, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0182, + "step": 23890 + }, + { + "epoch": 1.432081011444664, + "grad_norm": 0.431265652179718, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0177, + "step": 23900 + }, + { + "epoch": 1.4326802085205825, + "grad_norm": 0.42389997839927673, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0146, + "step": 23910 + }, + { + "epoch": 1.4332794055965006, + "grad_norm": 0.9380725622177124, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0206, + "step": 23920 + }, + { + "epoch": 1.433878602672419, + "grad_norm": 0.3655669093132019, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0151, + "step": 23930 + }, + { + "epoch": 1.4344777997483371, + "grad_norm": 0.3248157501220703, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0152, + "step": 23940 + }, + { + "epoch": 1.4350769968242556, + "grad_norm": 0.5733596086502075, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0175, + "step": 23950 + }, + { + "epoch": 1.4356761939001736, + "grad_norm": 0.4672720730304718, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0185, + "step": 23960 + }, + { + "epoch": 1.4362753909760921, + "grad_norm": 0.22989575564861298, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0165, + "step": 23970 + }, + { + "epoch": 1.4368745880520102, + "grad_norm": 1.0956321954727173, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0181, + "step": 23980 + }, + { + "epoch": 1.4374737851279287, + "grad_norm": 0.39079031348228455, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0212, + "step": 23990 + }, + { + "epoch": 1.4380729822038467, + "grad_norm": 0.3974068760871887, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0197, + "step": 24000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4991099461999e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5fecc60b61aa66699566b01045633ce2fd4a6a74 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-24000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad96fcc5212b0fb64af2ed9b5a1ad33dee0cea6a86c08271b39c38f4388a38a +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c83694646f7b7f3baaa23d029326c63a425243a6 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e927b75b9bcb04a997c476d4218fba019397d2b9f4d26d51ed98d5d281641f4 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2a6c0ffd66642efc789566460eb088269d7e6538 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18b2b1b6e5a878109122781f1bbb07d2cec06a23444af34688b3f2907dee6564 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e618e2c6854f93d4d6b0edbbfcdb9cb6bc8621e1 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ca5fd0e39e8e5d0f5830c7020e6ec66acf15b6e24119e38445ab9ec39fd2308 +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9801b876d4902a6f04c8f4fc65c072e6082867 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -4.131592681121827, + -18.96289906921387, + -16.909606227111816, + -1.205507601451874, + -2.2364452423095704, + -1.8819086204528812, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 16.65274486618042, + 37.19429024200439, + 23.655689654541014, + 1.3209557065963748, + 2.6528479496955875, + 1.1486967510223387, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 2.868856906890869, + 6.296340465545654, + 1.3196077346801758, + 0.007151931058615446, + -0.012491658329963684, + -0.12626242637634277, + 0.12140887975692749, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 4.3321146965026855, + 12.4215087890625, + 7.703039169311523, + 0.391439288854599, + 0.8076039552688599, + 0.505150318145752, + 0.9926025867462158, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.763728466033935, + -21.229162658691408, + -2.350775989151001, + -4.0587354017257695, + -3.285622364997864, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.5495108631134, + 30.41332916412354, + 14.36571702880859, + 1.8286980584144592, + 2.2455153399467473, + 1.9114159921646117, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.469674587249756, + 1.137302041053772, + -3.50521183013916, + -0.009232619777321815, + -0.7088616490364075, + -0.43785586953163147, + 0.14176446199417114, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.948984146118164, + 16.641460418701172, + 8.162801742553711, + 0.6890953779220581, + 1.1180040836334229, + 0.9564125537872314, + 0.9899004101753235, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0e927363e2e68971dd47f38683d15522ea7b289b --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/trainer_state.json @@ -0,0 +1,18234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.5579123973875006, + "eval_steps": 500, + "global_step": 26000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 5.55898904800415, + "learning_rate": 1.8e-07, + "loss": 0.7669, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.9875104427337646, + "learning_rate": 3.8e-07, + "loss": 0.7281, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 6.316451072692871, + "learning_rate": 5.800000000000001e-07, + "loss": 0.7134, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 4.037688255310059, + "learning_rate": 7.8e-07, + "loss": 0.6077, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 5.4920220375061035, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6779, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 3.809985876083374, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5578, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 5.501481533050537, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5453, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 2.584683418273926, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4145, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 2.854585886001587, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3617, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 3.2181553840637207, + "learning_rate": 1.98e-06, + "loss": 0.3402, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 1.6713179349899292, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2286, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 2.60302996635437, + "learning_rate": 2.38e-06, + "loss": 0.2477, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 1.7488818168640137, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1342, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 1.826812982559204, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.1243, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 1.1744091510772705, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1012, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 2.3573529720306396, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1108, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 2.1422371864318848, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1081, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.6756604313850403, + "learning_rate": 3.58e-06, + "loss": 0.0947, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 1.8197052478790283, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.103, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 2.135390281677246, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0791, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 1.185013771057129, + "learning_rate": 4.18e-06, + "loss": 0.0751, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 1.478454828262329, + "learning_rate": 4.38e-06, + "loss": 0.0685, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 1.1979939937591553, + "learning_rate": 4.58e-06, + "loss": 0.0642, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 1.3315266370773315, + "learning_rate": 4.78e-06, + "loss": 0.0706, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 1.219875454902649, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 1.9281997680664062, + "learning_rate": 5.18e-06, + "loss": 0.0781, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.5599610209465027, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0742, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.9128719568252563, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0638, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.5633432269096375, + "learning_rate": 5.78e-06, + "loss": 0.0633, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.7961149215698242, + "learning_rate": 5.98e-06, + "loss": 0.062, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 1.9408375024795532, + "learning_rate": 6.18e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 1.1925369501113892, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0654, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 1.0636825561523438, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0513, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.5671424865722656, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0561, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.8431388139724731, + "learning_rate": 6.98e-06, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 1.3813819885253906, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0619, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.7528055906295776, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0502, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 1.38446044921875, + "learning_rate": 7.58e-06, + "loss": 0.0623, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.9472984671592712, + "learning_rate": 7.78e-06, + "loss": 0.0471, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.640555739402771, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0539, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 1.4841065406799316, + "learning_rate": 8.18e-06, + "loss": 0.0684, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 1.0691452026367188, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.8026740550994873, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0579, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 1.3472259044647217, + "learning_rate": 8.78e-06, + "loss": 0.0725, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.8364902138710022, + "learning_rate": 8.98e-06, + "loss": 0.0613, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 1.0086181163787842, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0558, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 1.0559569597244263, + "learning_rate": 9.38e-06, + "loss": 0.0561, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.9138600826263428, + "learning_rate": 9.58e-06, + "loss": 0.0507, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.6099390387535095, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0543, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.890690803527832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.071, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.8349231481552124, + "learning_rate": 1.018e-05, + "loss": 0.0515, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 1.5466762781143188, + "learning_rate": 1.038e-05, + "loss": 0.0865, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 1.0859519243240356, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0511, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.7235454320907593, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.6314525008201599, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0494, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 1.5067164897918701, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0453, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.9329689145088196, + "learning_rate": 1.138e-05, + "loss": 0.0565, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 1.3631505966186523, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0513, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 1.2341063022613525, + "learning_rate": 1.178e-05, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.7126315832138062, + "learning_rate": 1.198e-05, + "loss": 0.0465, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.9995419383049011, + "learning_rate": 1.218e-05, + "loss": 0.0423, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.7614652514457703, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0466, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.7718682289123535, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0508, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.7280911803245544, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0481, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.6350377798080444, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0493, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.6868598461151123, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.057, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 1.132020354270935, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0464, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 1.097875952720642, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0465, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.8246905207633972, + "learning_rate": 1.378e-05, + "loss": 0.0488, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.5858931541442871, + "learning_rate": 1.398e-05, + "loss": 0.0533, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.418e-05, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.87618488073349, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0417, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.8312808871269226, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0627, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.5213949680328369, + "learning_rate": 1.478e-05, + "loss": 0.0526, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.7599508762359619, + "learning_rate": 1.498e-05, + "loss": 0.0487, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.9282987713813782, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0544, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 1.5959566831588745, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0594, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.6384497284889221, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.049, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.5377854108810425, + "learning_rate": 1.578e-05, + "loss": 0.0529, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.6186609864234924, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0485, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.9750168323516846, + "learning_rate": 1.618e-05, + "loss": 0.0458, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.6810588836669922, + "learning_rate": 1.638e-05, + "loss": 0.0521, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.8613447546958923, + "learning_rate": 1.658e-05, + "loss": 0.0464, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.8379164338111877, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0589, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.9312345385551453, + "learning_rate": 1.698e-05, + "loss": 0.0534, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.6983106732368469, + "learning_rate": 1.718e-05, + "loss": 0.0591, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.6549938321113586, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0571, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3887499272823334, + "learning_rate": 1.758e-05, + "loss": 0.0362, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 1.1392686367034912, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0602, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.834979772567749, + "learning_rate": 1.798e-05, + "loss": 0.0483, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.9094700813293457, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0536, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.9519254565238953, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0514, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.6514044404029846, + "learning_rate": 1.858e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.6005147099494934, + "learning_rate": 1.878e-05, + "loss": 0.0527, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 1.0990339517593384, + "learning_rate": 1.898e-05, + "loss": 0.0453, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.7029110193252563, + "learning_rate": 1.918e-05, + "loss": 0.0527, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.6106461882591248, + "learning_rate": 1.938e-05, + "loss": 0.043, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.48976996541023254, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0482, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 1.045139193534851, + "learning_rate": 1.978e-05, + "loss": 0.0449, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.7444337010383606, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0499, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.8378720879554749, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0606, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.5345956683158875, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.041, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.6428268551826477, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0648, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.9010246992111206, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0441, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.6655222177505493, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0532, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.5328973531723022, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0488, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 1.2394806146621704, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0525, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.9671902656555176, + "learning_rate": 1.999967041472886e-05, + "loss": 0.051, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.8754792213439941, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.054, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.524354875087738, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0682, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 1.0633796453475952, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0435, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.7348024249076843, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.923546552658081, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0501, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 1.0579051971435547, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.8214036822319031, + "learning_rate": 1.999882759038658e-05, + "loss": 0.057, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.7640904188156128, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0468, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5744732022285461, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0416, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.40397152304649353, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0389, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.6207796931266785, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0484, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 1.5230320692062378, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0586, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.8499330282211304, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0671, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.7697583436965942, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.061, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.6107252836227417, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.40468829870224, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0558, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.7711566686630249, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0487, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 1.0216137170791626, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0411, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 1.1135109663009644, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0428, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.545289158821106, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0426, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.9514102339744568, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0529, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.9448748826980591, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0468, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 1.1176340579986572, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.06, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.6428054571151733, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0398, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.8000763058662415, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0688, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.7624617218971252, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0524, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.7986068725585938, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0511, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 1.179044246673584, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0518, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.7511209845542908, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.041, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.8336644768714905, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.7198546528816223, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0472, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 1.404756784439087, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0479, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.861412525177002, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0448, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 1.2575286626815796, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0504, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.7020149230957031, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0416, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.9072129726409912, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0483, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.5503928661346436, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0498, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.5776561498641968, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.7854406237602234, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0431, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.7011817097663879, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0615, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.7760916352272034, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0525, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.9866206049919128, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0492, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.7466640472412109, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0564, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.8808642029762268, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0461, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.8980852365493774, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0613, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.6824257969856262, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0763, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.681532084941864, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0492, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.5667393207550049, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0471, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.5026432275772095, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0424, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.37448638677597046, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.037, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.6236661076545715, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.9748323559761047, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0326, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.7733910083770752, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0527, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.6466084718704224, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.6644402146339417, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0434, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 1.5936143398284912, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0495, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.5655786991119385, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0475, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.9557194709777832, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0518, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.8929481506347656, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0435, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.7515624761581421, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0404, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.7718303203582764, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0476, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.5583183765411377, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0495, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.7166038155555725, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0601, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.9311782717704773, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0507, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6159361600875854, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0319, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.816769003868103, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0505, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.9040331244468689, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.0498, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 1.696012020111084, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0689, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.5169436931610107, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0414, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 1.9156256914138794, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0558, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.6522107720375061, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0427, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.8480607867240906, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0425, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.6939795017242432, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0521, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.5763843059539795, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0486, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.6420201063156128, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0428, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.5305889248847961, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0371, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 1.3216971158981323, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0441, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.6441370844841003, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0444, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 1.4227683544158936, + "learning_rate": 1.996014938229576e-05, + "loss": 0.053, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.667000412940979, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0405, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.6865925192832947, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0532, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.8819414377212524, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0402, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.8738685250282288, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0494, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.8790421485900879, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0753, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.5451251268386841, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0385, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.46721863746643066, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0395, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.41896265745162964, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0461, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.7582527995109558, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0461, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.7154091596603394, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0464, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.788686215877533, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.46885132789611816, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0472, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.5174703598022461, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0501, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.8058022260665894, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.044, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.49327152967453003, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0404, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 1.532515048980713, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0548, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 1.1101130247116089, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0542, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.7396823763847351, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.042, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5801792740821838, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0589, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 1.4451886415481567, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0402, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.61793053150177, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0583, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.8073042631149292, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0492, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.9468027949333191, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0466, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.7384629249572754, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0589, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.4612124562263489, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.043, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.6821345090866089, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0373, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.6727206110954285, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0706, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.6935863494873047, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0376, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.9824007153511047, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0418, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.9782054424285889, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0453, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.7749345898628235, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0449, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 1.1558616161346436, + "learning_rate": 1.992544454099507e-05, + "loss": 0.051, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.33876606822013855, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0463, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.5539175271987915, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0389, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.554639995098114, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0375, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.46284249424934387, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0365, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.7209586501121521, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0465, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 1.0352572202682495, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0609, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.3893347680568695, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0449, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.3959295451641083, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.042, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.47758615016937256, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0608, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.7173318266868591, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0511, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.5889247059822083, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.5986958146095276, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.9506963491439819, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0513, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.8730902671813965, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0429, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.5152983069419861, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0347, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.786233127117157, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0464, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.7376151084899902, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0479, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.595055878162384, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0392, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.8207923769950867, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0441, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.7003177404403687, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.036, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.6637803316116333, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.5207458138465881, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0476, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 1.241939663887024, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0466, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.7212964296340942, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0459, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.6244897246360779, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.571205198764801, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0611, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.8839776515960693, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0464, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.580142080783844, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0434, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.6745111346244812, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0443, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.9726730585098267, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.48007458448410034, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0442, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.7205815315246582, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0461, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.5800597667694092, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0553, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.6497617959976196, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0398, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.7487000226974487, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.053, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.6686383485794067, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0494, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.6101617217063904, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0397, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.49039891362190247, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 1.076252818107605, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0472, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.7085466980934143, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0481, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.6343501210212708, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.7452435493469238, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0485, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.6645557880401611, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0455, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.5987662076950073, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0384, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 1.078682541847229, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0416, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.8880276083946228, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0427, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.8119439482688904, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0516, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.5018808245658875, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.035, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.623843252658844, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0468, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.48201584815979004, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0387, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.5672967433929443, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0374, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.7304291129112244, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0458, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 1.1493513584136963, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0495, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.8220258951187134, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0565, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 1.0740118026733398, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0484, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.6214267015457153, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0346, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6255515813827515, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 1.0625102519989014, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0511, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.8623147010803223, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.043, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.92961186170578, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0428, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.6050530076026917, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0405, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.944632351398468, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0434, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.4904105067253113, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0423, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.7352654337882996, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0425, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 1.0492011308670044, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0616, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.7823440432548523, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0447, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.8018720149993896, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0371, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.49853745102882385, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.036, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.8805229663848877, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0524, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.5573164820671082, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0387, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.7481330633163452, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0466, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.40816730260849, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0651, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.6791403889656067, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0393, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.7291558384895325, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0521, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.6312416791915894, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0489, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.7327824831008911, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0343, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 1.3112396001815796, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 1.2425460815429688, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0419, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.6839079856872559, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0491, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.7781338691711426, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0434, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.5329035520553589, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0468, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.7196246981620789, + "learning_rate": 1.978769450291435e-05, + "loss": 0.044, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.7625473737716675, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0441, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.5458085536956787, + "learning_rate": 1.978346349055984e-05, + "loss": 0.039, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.7765107154846191, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0467, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.7010345458984375, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.04, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.626748263835907, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0373, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.5149411559104919, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0461, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.9740221500396729, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.037, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.504397988319397, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.054, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.5483772158622742, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0365, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.29313552379608154, + "learning_rate": 1.976612732743278e-05, + "loss": 0.038, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.8453809022903442, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0413, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.5152369141578674, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0383, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.9969985485076904, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0465, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.9506912231445312, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0377, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.9154256582260132, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0428, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 1.2283018827438354, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0403, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.6880149841308594, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0395, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.4900283217430115, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0368, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.7604786157608032, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0447, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.559420108795166, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0456, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5867525339126587, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.4810929596424103, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0406, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.8294567465782166, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0405, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.8964418172836304, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0551, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5311513543128967, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.048, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.806564450263977, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0422, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.6752825975418091, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0436, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.5873673558235168, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.046, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.44951826333999634, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.6930672526359558, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0482, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5176821351051331, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0469, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.49050986766815186, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0505, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.7312544584274292, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0397, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.7582018375396729, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0472, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.5867499113082886, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0402, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.5435264706611633, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0357, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.7370457053184509, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.774713933467865, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0419, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 1.3614526987075806, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0443, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.6087996959686279, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0362, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.6685174703598022, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0437, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.9508783221244812, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0403, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.5553990006446838, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0454, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.5054144263267517, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0651, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.42293739318847656, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0431, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.7212286591529846, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0415, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.473127543926239, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.6872493028640747, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.031, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.5251455903053284, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0391, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.5380337834358215, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0409, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.7052116394042969, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0416, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.8229309916496277, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0372, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.9506240487098694, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0419, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.6417449116706848, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.6112877130508423, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0498, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 1.0621747970581055, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0478, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.7538444995880127, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0402, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.5625021457672119, + "learning_rate": 1.964833301001045e-05, + "loss": 0.048, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.47914358973503113, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0371, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.6854084134101868, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0478, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.9252145886421204, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0368, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.8439743518829346, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0417, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 1.0050065517425537, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0444, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.7451267242431641, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0444, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.8371824622154236, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0413, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 1.0461528301239014, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0343, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.39973369240760803, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0411, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.4291725754737854, + "learning_rate": 1.962083815106258e-05, + "loss": 0.035, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.7072318196296692, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.5897591710090637, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0422, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.724743664264679, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0412, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.6499989628791809, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0456, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.7375554442405701, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0481, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.5231707096099854, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0444, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.6235650777816772, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0352, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.43499720096588135, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0389, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.797736406326294, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0444, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 1.0550916194915771, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0504, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.6214169263839722, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0406, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.698083221912384, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0593, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.6379665732383728, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0493, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.5507146120071411, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0433, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.5956857204437256, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.44772031903266907, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0479, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.9360495209693909, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0434, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.5642439126968384, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0396, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.4046037495136261, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0408, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.5948778986930847, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0349, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.8199960589408875, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.035, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.4827987253665924, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0422, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.8324541449546814, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0396, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.4008340537548065, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0399, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.6216022372245789, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0456, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.37505266070365906, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0385, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.49176743626594543, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0394, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.5399725437164307, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0438, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.8310949802398682, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 1.1955338716506958, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0459, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 1.0068060159683228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0491, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.5460902452468872, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.7850955128669739, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.038, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.36727651953697205, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.042, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.5334084630012512, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0472, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.7271261215209961, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0382, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.5323888063430786, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0436, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.45585381984710693, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0374, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.7871994376182556, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0523, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.5605924129486084, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0394, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.6938880085945129, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0394, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.5804795026779175, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0437, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 1.0168874263763428, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0419, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.6860261559486389, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0381, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.7029629349708557, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.5081820487976074, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0359, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4721413254737854, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0445, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.36132606863975525, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0443, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.6331628561019897, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.5754039287567139, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0364, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 1.5680726766586304, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0568, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.49352893233299255, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0352, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.6292720437049866, + "learning_rate": 1.945830755977688e-05, + "loss": 0.056, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.7185224294662476, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.5580431222915649, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0395, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.7590157985687256, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0367, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.6500505208969116, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0373, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.408975213766098, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0458, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.5616204142570496, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.6361889243125916, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0371, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.8486977219581604, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0428, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.7492835521697998, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0444, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.7901867032051086, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0413, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.6845218539237976, + "learning_rate": 1.942106227801521e-05, + "loss": 0.041, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.9644033908843994, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0482, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.45466694235801697, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.37155815958976746, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0563, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.4936427175998688, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0466, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.6540364027023315, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0426, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.38369905948638916, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.5450782179832458, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0499, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.24151510000228882, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0431, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.8069043159484863, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0447, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.5423257946968079, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0496, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.4058588445186615, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0402, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.6126188635826111, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0458, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.7490487694740295, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0493, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.7295238971710205, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0462, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.7178632616996765, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0443, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.7040836215019226, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0414, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.6338651776313782, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0354, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 1.3360642194747925, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0503, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.46927154064178467, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.7340303659439087, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0381, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5492366552352905, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0328, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.7509336471557617, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0368, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.4471103847026825, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0405, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.6582043170928955, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0422, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.6933317184448242, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0347, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.450021892786026, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.5376274585723877, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0619, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.722744882106781, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0446, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.6075776219367981, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.047, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.6950559020042419, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0366, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.5763269066810608, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0416, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.5462995767593384, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.042, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.6304270029067993, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0388, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.6828057765960693, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0324, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.37152284383773804, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0451, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.4172256588935852, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0357, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.5640333294868469, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0522, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.8016167879104614, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0381, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.591262698173523, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0382, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.5212893486022949, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0478, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.7837402820587158, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0443, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.9257993698120117, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0468, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.5952717065811157, + "learning_rate": 1.926404507646751e-05, + "loss": 0.033, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.9675727486610413, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0451, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5518060326576233, + "learning_rate": 1.925630281527157e-05, + "loss": 0.039, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.9742224216461182, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0398, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.6197847723960876, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0466, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.47963154315948486, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0449, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.41337478160858154, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0441, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.7238340973854065, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0438, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.9248948097229004, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.059, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.6670559048652649, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0388, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.956350564956665, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0402, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.6378766894340515, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0377, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.9037134647369385, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.046, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.7720431685447693, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0519, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.7988153100013733, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0437, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.6672379970550537, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.8264118432998657, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0463, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.6753244400024414, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.048, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.5530163645744324, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0552, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 1.4215611219406128, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0537, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.8495141267776489, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0431, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.5609806180000305, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0355, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.30011680722236633, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0503, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.5155858993530273, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0402, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.48371294140815735, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0476, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.49065709114074707, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.4877799451351166, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0337, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.5917441248893738, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0379, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.42583322525024414, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.045, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.6343463659286499, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0449, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.8575727343559265, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0453, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.7644649147987366, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0396, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.6534778475761414, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0354, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.3632607161998749, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.035, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.9180629849433899, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.48914220929145813, + "learning_rate": 1.912298771234382e-05, + "loss": 0.043, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.8579902052879333, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0467, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 1.523177146911621, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 1.2650493383407593, + "learning_rate": 1.911035077753307e-05, + "loss": 0.046, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.8262631893157959, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0345, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.8710194826126099, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.8287770748138428, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.7243760824203491, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0445, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5953600406646729, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0409, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.5678296685218811, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0405, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.764759361743927, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0399, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.5969082713127136, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0345, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.5686851739883423, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0415, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.7042335867881775, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0343, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.46049684286117554, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0367, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.521037757396698, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0493, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.6116137504577637, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0341, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.6932541728019714, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.038, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.6795322299003601, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0555, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 1.5589205026626587, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0498, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.58689945936203, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0432, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.7746279239654541, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0455, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4707143008708954, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0365, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.6717873811721802, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0441, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 1.1001774072647095, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0387, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.6617273092269897, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.045, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 1.0732862949371338, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0461, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.43623387813568115, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0387, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.5842541456222534, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0401, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.8832051753997803, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0434, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.8454849123954773, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0364, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4587421119213104, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0342, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.5914700627326965, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0381, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.5075448751449585, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0614, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.6165316700935364, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0355, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.8761339783668518, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0382, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.8730667233467102, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0486, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.4631735384464264, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0479, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.7657212615013123, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0359, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.49685898423194885, + "learning_rate": 1.894749443411004e-05, + "loss": 0.037, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.8567603230476379, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0415, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.8778802156448364, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0427, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.7849876284599304, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.041, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.49304109811782837, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0406, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.6490961909294128, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0457, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 1.1704363822937012, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0489, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.6955687403678894, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0426, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.9385222792625427, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0397, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 1.0259507894515991, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0406, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 1.5581048727035522, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0377, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 1.1154224872589111, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0352, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.8913238048553467, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0372, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.32929253578186035, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0302, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.7686375379562378, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.7077587246894836, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0404, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.7370178699493408, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0379, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.8013477325439453, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0391, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.9743591547012329, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0466, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.6816489100456238, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0509, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.7814317345619202, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0449, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.6797910332679749, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.041, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.7159250378608704, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0408, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.7630175352096558, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0403, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.7929314374923706, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0468, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.5765302181243896, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0382, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.5043740272521973, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0447, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.7895818948745728, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0381, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.8037170767784119, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0434, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 1.0758732557296753, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0369, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.6673927307128906, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0475, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.6661775708198547, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0478, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.6422731280326843, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.6632615923881531, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0377, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.5715954899787903, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0306, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3375200629234314, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0385, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.42938506603240967, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0359, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.453436940908432, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0498, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.763883113861084, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.037, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.9350517392158508, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0524, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.6795313358306885, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0336, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4761887788772583, + "learning_rate": 1.875213208215953e-05, + "loss": 0.04, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.6547576189041138, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0359, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.7119831442832947, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0382, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.5195598602294922, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0577, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.44893282651901245, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.034, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.5159012079238892, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0374, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.6474353075027466, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0275, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.5070436000823975, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0382, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.28868627548217773, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0442, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.3915226459503174, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.6271824836730957, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0395, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 1.2117619514465332, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0409, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.4455721378326416, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0399, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0445, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.32646581530570984, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0435, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.4477322995662689, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0383, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.6562448740005493, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0317, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.25427868962287903, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0326, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.6234788298606873, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0328, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.4264411926269531, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0379, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.5537038445472717, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0383, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.5042442679405212, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0339, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.4152010679244995, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0324, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.6834092736244202, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0364, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.6276392340660095, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0336, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.687937319278717, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0415, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.48481765389442444, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0376, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 1.1335153579711914, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0421, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.6853719353675842, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.043, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.97500079870224, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0334, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.2953243553638458, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0334, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.6563237309455872, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0349, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.4983973205089569, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0441, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.42969775199890137, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0319, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.8316324353218079, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0359, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.4386466443538666, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0371, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.5664681792259216, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0359, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.5660601854324341, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.6432987451553345, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0447, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.6026568412780762, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0382, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.5358585119247437, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0366, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.3575671315193176, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0394, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.6645073890686035, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0391, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.6527594923973083, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0334, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.5664045810699463, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0426, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.3317505419254303, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0366, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.7218614220619202, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0399, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.6683867573738098, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0385, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.6589217185974121, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0445, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.39663317799568176, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0515, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.9468401074409485, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 1.0980640649795532, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0431, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 1.4567275047302246, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0467, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.3785778284072876, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0437, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.8112056255340576, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0406, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.8885411024093628, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0452, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.3356691002845764, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.033, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.7636258602142334, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.039, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.5050523281097412, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0331, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3761812150478363, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0346, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.560323178768158, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0417, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.5850566625595093, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0366, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.4377721846103668, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0315, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.5460193157196045, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0465, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.3818223476409912, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0313, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.566722571849823, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.037, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.970040500164032, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0354, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4968736171722412, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0376, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.5235893130302429, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0383, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.853208065032959, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0384, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.4627811312675476, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0615, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.4883791208267212, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0307, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.4702740013599396, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0539, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.5020611882209778, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0378, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.706611692905426, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0309, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.6137747764587402, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0364, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.45299193263053894, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0359, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.31410297751426697, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0425, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.48510870337486267, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.04, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.4697261154651642, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0401, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.8231471180915833, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0346, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.9511741995811462, + "learning_rate": 1.832162565208597e-05, + "loss": 0.038, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.4473752975463867, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0421, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.5309840440750122, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0375, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 1.1700010299682617, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0424, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.5007262229919434, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0389, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.8835527300834656, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.6059357523918152, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.37744027376174927, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0391, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.5641717910766602, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0383, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.4394749104976654, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0394, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.7094572186470032, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0384, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.6306723952293396, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0347, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.4480315148830414, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0415, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 1.014607310295105, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0426, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.7599517107009888, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0433, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 1.0942739248275757, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0378, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.47618037462234497, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0312, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6470023393630981, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0382, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.6031871438026428, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0336, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.7470970749855042, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0318, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.46166181564331055, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0361, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5585920214653015, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0443, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.5172198414802551, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0396, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.4908123314380646, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0294, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.5269665122032166, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0343, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.747257649898529, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0395, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.6794129610061646, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0471, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.4291394054889679, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0388, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.8051080107688904, + "learning_rate": 1.815952390818299e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.557299792766571, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0384, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.37832972407341003, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0333, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.30844688415527344, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.033, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.3014371395111084, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0344, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.778361439704895, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0351, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 1.14492666721344, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0462, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.35099321603775024, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0371, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.8470032215118408, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0339, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.641718327999115, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0363, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.6668172478675842, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0383, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.9396918416023254, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0401, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.5773718953132629, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0356, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.6474881172180176, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0487, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.5183063745498657, + "learning_rate": 1.807599344877606e-05, + "loss": 0.037, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.7699562311172485, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0487, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.6379490494728088, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0407, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4757876396179199, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0307, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.47382786870002747, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0367, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.6868136525154114, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0311, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.5475189089775085, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0293, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 1.013775110244751, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0383, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.46351560950279236, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0404, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.4883617162704468, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0408, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.6282979249954224, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0428, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 1.0833567380905151, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0394, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0405, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.7581565380096436, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0534, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.7900646328926086, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0432, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.6033529043197632, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0438, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.924926221370697, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0347, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.8485580682754517, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0523, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.3205278217792511, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0334, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.5392606854438782, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.03, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.6815987229347229, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0385, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.9605218768119812, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0359, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.5565723776817322, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0391, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.7528144717216492, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0431, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.5746167898178101, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0346, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.5058369636535645, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0346, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 1.1387027502059937, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0372, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.819324254989624, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0374, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.45600345730781555, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0344, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.7428935766220093, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0373, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.6960753202438354, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0387, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.6637990474700928, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0404, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.5612137317657471, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0375, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.6323001384735107, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0379, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.35169267654418945, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0371, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.38252803683280945, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0457, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.38694459199905396, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0345, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.37036198377609253, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0292, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.8060199618339539, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0398, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.44252580404281616, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0373, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.5565180778503418, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0345, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.4460795521736145, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0404, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.7309815883636475, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0364, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.6990997195243835, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0561, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.4198327660560608, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0401, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.04, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.48884230852127075, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0334, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.6440362930297852, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0451, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.9092825055122375, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0398, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.4839508533477783, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.8128801584243774, + "learning_rate": 1.776452218695584e-05, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.5291397571563721, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0394, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.6852243542671204, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0418, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.6294205188751221, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0374, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.5221384763717651, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0321, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.398296982049942, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0349, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.43008267879486084, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.6012991070747375, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0411, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.45076051354408264, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.037, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.6742259860038757, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0357, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.5989789962768555, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.037, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.4041040241718292, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0325, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.4937855899333954, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0354, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.5446217656135559, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0374, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.7479701638221741, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0415, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.7822495102882385, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0341, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.3672648072242737, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.035, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.5219965577125549, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0443, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.4092100262641907, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0331, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.5316944122314453, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0406, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 1.072263240814209, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0521, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.7448581457138062, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0362, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.44557711482048035, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0326, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.4298631250858307, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0365, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.45413365960121155, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0351, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.9562819004058838, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0394, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.9481335878372192, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0381, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.5020818114280701, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0402, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.6412234902381897, + "learning_rate": 1.758137056131309e-05, + "loss": 0.037, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.5511493682861328, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0535, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.5222594141960144, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0401, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.447127103805542, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0383, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.4780801832675934, + "learning_rate": 1.754802282200567e-05, + "loss": 0.041, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.2962804138660431, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0422, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.5125643014907837, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0337, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.4288216829299927, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0374, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4114690124988556, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0296, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.3511301577091217, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0315, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.8624657392501831, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0369, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.5518651008605957, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0364, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.5404661297798157, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0294, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.7494591474533081, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0315, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9748606085777283, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0429, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.8071768879890442, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0321, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.5210712552070618, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0355, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.6077958941459656, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0426, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.8688217997550964, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0366, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.7064969539642334, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0465, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0365, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.6350638270378113, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0419, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.42818939685821533, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0412, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.6915261745452881, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0327, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.9861057996749878, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.034, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.6910699009895325, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0463, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.6368144750595093, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0399, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 1.1909242868423462, + "learning_rate": 1.739216409306913e-05, + "loss": 0.042, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.6449970006942749, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0388, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.531061053276062, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0389, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.8275352716445923, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0503, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.8468548655509949, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0336, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.2949988842010498, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0342, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.30603477358818054, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 0.7177753448486328, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0381, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.4893733859062195, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0319, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.6618909239768982, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0317, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.5965152382850647, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0293, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.4357168674468994, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0478, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.9539002776145935, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0444, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.7171940207481384, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.037, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.5711817741394043, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.034, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.4134632647037506, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.39306095242500305, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0351, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.318985253572464, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0425, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.7674827575683594, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.041, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.7754977941513062, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0436, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.5827674269676208, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0371, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.3957774341106415, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0401, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.47415387630462646, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0344, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.6292631030082703, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.5913583636283875, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0385, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.465749055147171, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0402, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.7115443348884583, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0372, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.7476089596748352, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.042, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.5902891159057617, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0319, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.7117035984992981, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0312, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.7726907730102539, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0381, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.7318345308303833, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0464, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.8139578104019165, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0334, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.6128831505775452, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0338, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.478384405374527, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0361, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.36900776624679565, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0473, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 1.031351923942566, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0417, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.5248333215713501, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0402, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.6325647830963135, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.047, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.8417870402336121, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0406, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.617125391960144, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0385, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.4480224847793579, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0391, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 1.0203324556350708, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0379, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.6231842637062073, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0318, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37685611844062805, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0304, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 1.0700500011444092, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0362, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.4233555495738983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0341, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.7783017158508301, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0331, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.718287467956543, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0385, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.5477543473243713, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0308, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.5601311326026917, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0384, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.4944303631782532, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.5038384199142456, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0382, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.7288672924041748, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0319, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 1.0376721620559692, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0374, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.8827543258666992, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0351, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4307865798473358, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0321, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.5480561256408691, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0532, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.9598987102508545, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0365, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.4162677228450775, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0274, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.8729338049888611, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0437, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.7729384899139404, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0386, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.6997544169425964, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0303, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.49331608414649963, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0333, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.6684675812721252, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0329, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.5638986825942993, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.035, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.8375849723815918, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0431, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.5796175599098206, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0298, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.5302409529685974, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.032, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.43450990319252014, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0415, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.3897189795970917, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0372, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.8202592134475708, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0329, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.8023095726966858, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.3732883930206299, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0326, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.4916521906852722, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.031, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.46110638976097107, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.037, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.8587718605995178, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0351, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.7067242860794067, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.036, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.732545793056488, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.036, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.6573438048362732, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0392, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.6036579608917236, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0383, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.5556638836860657, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0396, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.7848073244094849, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0333, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.5758033394813538, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0315, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.5620765686035156, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0277, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.38210418820381165, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0437, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.6145310997962952, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0368, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.7370103001594543, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0349, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.942118763923645, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0399, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.5294848680496216, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0364, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.5716073513031006, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0313, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.4549729526042938, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0423, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.5841232538223267, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0369, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.3302208483219147, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.032, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.7107377648353577, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0382, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.6884296536445618, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0324, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.6279621720314026, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0314, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.882046103477478, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0408, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.8980706334114075, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0436, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.6433938145637512, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0395, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.6394492983818054, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.041, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.8700910806655884, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0333, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.6309515237808228, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0341, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.7955977916717529, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.8543604016304016, + "learning_rate": 1.663934987558109e-05, + "loss": 0.042, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.6915370225906372, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0347, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.6430726647377014, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0395, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.3080710768699646, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0299, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0407, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.7147136330604553, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0524, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.603560209274292, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.032, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.4913748502731323, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0419, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.532796323299408, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0463, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.7834717631340027, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0318, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.4865007698535919, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0329, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.5567988753318787, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0331, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.7487075328826904, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0408, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0294, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.7240496277809143, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0334, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.44733667373657227, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0378, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.7610008716583252, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0398, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 1.0738579034805298, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0461, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.5492804050445557, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0367, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.7817861437797546, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0392, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.6080313324928284, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0288, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.8218061923980713, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0335, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.6597305536270142, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0398, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.6254639625549316, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0339, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 1.0747283697128296, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0386, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.4679741859436035, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0409, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.7349653244018555, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0355, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.47712597250938416, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0524, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.8520345091819763, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0361, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.6470016837120056, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0296, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.8512763381004333, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0329, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.5876182913780212, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0381, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.47419166564941406, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0348, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.391215056180954, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0366, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.5373614430427551, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0373, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.23266319930553436, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0283, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.8146935105323792, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0377, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.5002696514129639, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0296, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.7518969774246216, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0394, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.44596755504608154, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0359, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.37095823884010315, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.031, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.48388785123825073, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0323, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.4681354761123657, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0573, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.9335370063781738, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0397, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.8231816291809082, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0307, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.7194622755050659, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0435, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.468923419713974, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.5806415677070618, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0422, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.6381694078445435, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0325, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.6025328636169434, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0321, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.7287771701812744, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0432, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.7109095454216003, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0315, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.4904409348964691, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0317, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.7382795214653015, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0296, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 1.2814927101135254, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.4594469368457794, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0297, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.5907943844795227, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.623093843460083, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0314, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.5146417021751404, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0362, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.5858095288276672, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0339, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.4178197383880615, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0445, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.37311851978302, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0321, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.6305625438690186, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0376, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.5927552580833435, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0339, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.4024806022644043, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.5766516327857971, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0325, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.4729812443256378, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0476, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.4650471806526184, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0387, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.6432391405105591, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.6335821151733398, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0307, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.5947774052619934, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0374, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.7248526811599731, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0286, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.5646173357963562, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0426, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.4240330457687378, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0261, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.6439619064331055, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0325, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.5899927020072937, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0328, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.6412765383720398, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.027, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.28143197298049927, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0285, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.2767931818962097, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0312, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.47175201773643494, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0318, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.4454171359539032, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0357, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.4573518931865692, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0319, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.5321150422096252, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0423, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.27531248331069946, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0284, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.663298487663269, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0328, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.9017484188079834, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0328, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0445, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.4777899980545044, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0348, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.5475958585739136, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0418, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.524467408657074, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0301, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.6302708387374878, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0334, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.41625329852104187, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0353, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2699313759803772, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0387, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.701999306678772, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.6053565144538879, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0343, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.864326000213623, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0371, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.7532107830047607, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0323, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.5603524446487427, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0357, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.5668624639511108, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0421, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.6352995038032532, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0381, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.7873902320861816, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0293, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.5853860378265381, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0336, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.525260329246521, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0404, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.4027518033981323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0334, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.9426722526550293, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0397, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.6003656983375549, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0408, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.643667459487915, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0507, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.6342907547950745, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0338, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.4388107657432556, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0393, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.3304736614227295, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0371, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.6479781866073608, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0357, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.5461524128913879, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0367, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.4362160563468933, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0302, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.5188114643096924, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0322, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.34805068373680115, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0355, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.5073755383491516, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0446, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.5647034645080566, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0386, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.5983169078826904, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.4163302481174469, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0278, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.5769792199134827, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.33103784918785095, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0272, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.6019038558006287, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0286, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.8199634552001953, + "learning_rate": 1.56658563993822e-05, + "loss": 0.041, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.7426667213439941, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0327, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.3630203306674957, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0316, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.7804543972015381, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0369, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.43314239382743835, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0362, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.5570499897003174, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0307, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.5796618461608887, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0312, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.7355082035064697, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0357, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.39807555079460144, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0281, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.7723329663276672, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0314, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.3936077058315277, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0344, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.6881195902824402, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0343, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.5343065857887268, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0336, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.6643530130386353, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.032, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.5642407536506653, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0326, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.6929567456245422, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0351, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.33013442158699036, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0362, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 1.056101679801941, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0443, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.5164589881896973, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0446, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.319035142660141, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0367, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.8530817627906799, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0321, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.7768056392669678, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0318, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.4015219211578369, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0263, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.6409371495246887, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0371, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.5829829573631287, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0424, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.8098331093788147, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0318, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.40581029653549194, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0345, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.5018268823623657, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0338, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.3689005970954895, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0304, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.4961407482624054, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0349, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.5551972389221191, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0389, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.5989762544631958, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0308, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.33431145548820496, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0291, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.5390793085098267, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0409, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.6348057389259338, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0299, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.9015149474143982, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0372, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.4148661494255066, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0351, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.48212167620658875, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0369, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.6210904121398926, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0387, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.4606397747993469, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0325, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.597671627998352, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0264, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.39612457156181335, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0291, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.514916718006134, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0327, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.3551333248615265, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0306, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.3721555173397064, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0343, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3669307231903076, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0339, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.5142899751663208, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0388, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.7722563147544861, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0319, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.5405625104904175, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.025, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.6617732048034668, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0361, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.8938334584236145, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0326, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.7913880944252014, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0325, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.6919751763343811, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0353, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.6518043279647827, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0292, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.8302627801895142, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0292, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.6278629302978516, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0314, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.42736759781837463, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0313, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 1.0469647645950317, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.038, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.4306422173976898, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.692587673664093, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.034, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.8272542953491211, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0332, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.700703501701355, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0435, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22474133968353271, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0348, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.47771376371383667, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0365, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.5043072700500488, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0336, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.4886966347694397, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0291, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.3845444321632385, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0418, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.6324570775032043, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0357, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.5614244937896729, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0351, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.4815816879272461, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0401, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.7729785442352295, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0357, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.589121401309967, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0319, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.5420895218849182, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0346, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.4504237771034241, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0279, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.26984909176826477, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.6075000762939453, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0319, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.6065084338188171, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0383, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.573225736618042, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0424, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.8821173906326294, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0409, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.4947790205478668, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0472, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.748337984085083, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0384, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.6375566124916077, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0373, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.6218035221099854, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0343, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.4296681880950928, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0317, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3609360158443451, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0348, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.49597665667533875, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.034, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.4339931309223175, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0351, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.44051092863082886, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0391, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.41610655188560486, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0345, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.6215106844902039, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0439, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.6418285965919495, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0289, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.6148926019668579, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0396, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.8690620064735413, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0371, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.4794996678829193, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.7622746229171753, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0396, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 1.0384955406188965, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0352, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.33424243330955505, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0272, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.5626234412193298, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0267, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.31714314222335815, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0297, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.8281066417694092, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0337, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.6054716110229492, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0336, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.5764144659042358, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0296, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.4696876108646393, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0318, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.5324695110321045, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0294, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.2989593744277954, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0275, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.6373855471611023, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0334, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.5332064032554626, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0333, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.4900652766227722, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.6812027096748352, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0321, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.6765509843826294, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0329, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.5016193389892578, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.034, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.5259473919868469, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0341, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.4551076292991638, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0289, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.5946309566497803, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0367, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.8045580387115479, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0292, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 1.089473843574524, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0433, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.7314861416816711, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0344, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.3244793713092804, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0329, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.9454575181007385, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.041, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.4321480393409729, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0338, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.7338399887084961, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0317, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.5811594724655151, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0299, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 1.1259782314300537, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0402, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.4460951089859009, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0279, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.4996945858001709, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0331, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.6428117156028748, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0339, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.7815113663673401, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0333, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.46364331245422363, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0321, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.6084109544754028, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0347, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.5775942206382751, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.4764224886894226, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0326, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.49608105421066284, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.033, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.40599140524864197, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0323, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.44920462369918823, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0348, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.393081396818161, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0329, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.5393109917640686, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0332, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.49641427397727966, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0341, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.4762181341648102, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0293, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.7498350143432617, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0338, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.5212231874465942, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0336, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3803718388080597, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0336, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.3723069429397583, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0313, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.6411343216896057, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0298, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.7487270832061768, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0334, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.4146348237991333, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0362, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.6354920864105225, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0345, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.8422425985336304, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.6452838182449341, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0317, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.6057304739952087, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0349, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.4880058467388153, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0283, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.6094764471054077, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0424, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.552979588508606, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0318, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.5134180188179016, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0267, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.3264164626598358, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0347, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.6406404972076416, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0326, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.4818336069583893, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0357, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.4660695791244507, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0348, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.527518093585968, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0293, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.5105645656585693, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0299, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.5807327628135681, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0348, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.34552720189094543, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0281, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.6902264952659607, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0345, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.7842390537261963, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0392, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.37371599674224854, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0307, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.4447094798088074, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0343, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.5179654359817505, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0328, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.34313148260116577, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0327, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.5038807988166809, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0398, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.5751231908798218, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0365, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.23205915093421936, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0338, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.3348182141780853, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0264, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.432725727558136, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0377, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.5504162907600403, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0334, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.7994229793548584, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0369, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.7374292016029358, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0305, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.786674976348877, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0283, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.39285191893577576, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.028, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.49710261821746826, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0285, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.2925172448158264, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0353, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.5930903553962708, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0265, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.5205737352371216, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0349, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.5042659044265747, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0376, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.6537132263183594, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0402, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.5453435182571411, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0344, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.7153663635253906, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0365, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.4821360409259796, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0359, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.389950156211853, + "learning_rate": 1.403120543105273e-05, + "loss": 0.031, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.6750137805938721, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0353, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.5380377173423767, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0329, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.45814576745033264, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0312, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.6910536289215088, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0349, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.49182868003845215, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0377, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.41329771280288696, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0383, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.47242429852485657, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0313, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.45115360617637634, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0294, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.44364428520202637, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0328, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.4205247461795807, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0282, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 1.0961225032806396, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0274, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.6065059304237366, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0327, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.3095875084400177, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0348, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.8527400493621826, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0285, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.4449825882911682, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0435, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 1.1708461046218872, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0312, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.6145966053009033, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0283, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.5100684762001038, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0331, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.37704023718833923, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0327, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.6774486899375916, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0347, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.4984931945800781, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0303, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.6189061403274536, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0316, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4665672183036804, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.038, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.898800790309906, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0292, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.5205129384994507, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0322, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.588542640209198, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0307, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.620620846748352, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.035, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.639234185218811, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0296, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.38672956824302673, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0355, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.5244165062904358, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0305, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.8960945010185242, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0323, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3789278566837311, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.031, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.5104514956474304, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0405, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.5860878825187683, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0376, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.9913963079452515, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0386, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.4112319350242615, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0276, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.703815221786499, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0303, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.7342479825019836, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0303, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.46025165915489197, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0324, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.3976695239543915, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0255, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.4137699604034424, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0298, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.6333696842193604, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0438, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.5179958343505859, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0268, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.5947912335395813, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0266, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.7916423678398132, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0363, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.7686305046081543, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0338, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.5727254152297974, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0275, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.8913756012916565, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0365, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.45855259895324707, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0401, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.8214496374130249, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0371, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.5001949667930603, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.033, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.6546716094017029, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0422, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.35789239406585693, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0323, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.7539666891098022, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0316, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.422543466091156, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0388, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.5595449805259705, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0351, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.3847978115081787, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0285, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.4276559352874756, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0292, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.5125867128372192, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0351, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.7208243012428284, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0293, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.5181360244750977, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0316, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.3499206304550171, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0281, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.26258599758148193, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.027, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.7002774477005005, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.031, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.5419202446937561, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0384, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.3112017512321472, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0234, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.6459445357322693, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0302, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.5128807425498962, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0385, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.41403454542160034, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0321, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.4647153615951538, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0358, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.29951611161231995, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0288, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.3440749943256378, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0274, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.413753867149353, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0276, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.29087361693382263, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.03, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.7001593708992004, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0277, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.47245970368385315, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0426, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.5747501850128174, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0337, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.42420580983161926, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0407, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.2931080162525177, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0344, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.8410253524780273, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0385, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.27601751685142517, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0304, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5673372745513916, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0261, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.5385505557060242, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0296, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.4159039556980133, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0343, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 1.0409079790115356, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0325, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.5017931461334229, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0311, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.45170727372169495, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0302, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.7260886430740356, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0353, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.7251535058021545, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0329, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.21863135695457458, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0354, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.5168152451515198, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0268, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.509765088558197, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0321, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.4227997958660126, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.031, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.5740527510643005, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0351, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.5497387647628784, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0277, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.3965212106704712, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.028, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.43198928236961365, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0421, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.42254316806793213, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0335, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.3395012617111206, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0309, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.6258816719055176, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0287, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.7914189100265503, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0263, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.4104739725589752, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0282, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.47704172134399414, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0358, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.7908433675765991, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0341, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.7039026021957397, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0369, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.4095489978790283, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.047, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.6500707864761353, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0285, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3794250190258026, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0293, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.3065261244773865, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.031, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.3773103654384613, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0303, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.602186918258667, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0398, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.5309048891067505, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0251, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.9474682211875916, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0345, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.7786683440208435, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0289, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.6320096850395203, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0326, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.7034086585044861, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0332, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.5060988664627075, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0337, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.7484520673751831, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0317, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.6556681394577026, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0349, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.41952699422836304, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0318, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.4678110182285309, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0328, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.35579657554626465, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0346, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.5984554290771484, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0277, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.41169118881225586, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0288, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.5163332223892212, + "learning_rate": 1.285944160290905e-05, + "loss": 0.027, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.780305802822113, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0249, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.4293205142021179, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0302, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.650065004825592, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0349, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.3155161142349243, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0333, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.5841111540794373, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0371, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.3873291015625, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0304, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.39657002687454224, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0279, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.6305680871009827, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0293, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.5810249447822571, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0317, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.6288999319076538, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0283, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.5402754545211792, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0258, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 1.3184820413589478, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0398, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.9564218521118164, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0301, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.8810652494430542, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0376, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.4254887104034424, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0336, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.45076319575309753, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0266, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.6057546138763428, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0292, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.4007343649864197, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0352, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.4183088541030884, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0265, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.368300199508667, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0326, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.4838104844093323, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0262, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.5136057138442993, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0299, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.5161435604095459, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0339, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.6350359320640564, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0361, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.5247905254364014, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0259, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.5668240785598755, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0324, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.48688119649887085, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0395, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.8496071100234985, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0326, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.7072296142578125, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0307, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.7262448072433472, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0376, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.5265096426010132, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0331, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.7246168851852417, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0286, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.4539868235588074, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.036, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.36881664395332336, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0302, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.37113773822784424, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0278, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.537762463092804, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0325, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.6519997715950012, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0309, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.31448549032211304, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0245, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.43815988302230835, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0398, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.525791585445404, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0261, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.4887944757938385, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.025, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.5287007689476013, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0278, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.7277513146400452, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0304, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.6415050029754639, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0292, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.48691895604133606, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0337, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.53068608045578, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0338, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.5464624762535095, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0303, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.3911614418029785, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0345, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.6894099116325378, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0365, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.5268317461013794, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0405, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.8635499477386475, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0321, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21542859077453613, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0264, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.6257337331771851, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0355, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.6525475978851318, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0304, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.4599299430847168, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0314, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.7497361898422241, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.031, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.3124896287918091, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0257, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.6170748472213745, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0323, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.4619428515434265, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0315, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.5088011026382446, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0255, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.5397948622703552, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0265, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.457082062959671, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0279, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.4131294786930084, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0269, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 1.1949660778045654, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.6057063341140747, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0306, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.26918280124664307, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0283, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.48841091990470886, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0323, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.6195886135101318, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0295, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.5798623561859131, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.031, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.4877539277076721, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0267, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.33261221647262573, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0261, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.8361077904701233, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0311, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.305922269821167, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0302, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.22662357985973358, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.028, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.4273515045642853, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0307, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.521216869354248, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0277, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.7090896368026733, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0346, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.3693661391735077, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0305, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.3651321530342102, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0263, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.5577923655509949, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0357, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.6504148840904236, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0404, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.49205282330513, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.035, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.6053458452224731, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0328, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.5949649214744568, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0302, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.5310356020927429, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0264, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4087911546230316, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0273, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.35929426550865173, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0274, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.5112904906272888, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0253, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.39148232340812683, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0305, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.47718697786331177, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0304, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.620936393737793, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0289, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.8953443169593811, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0328, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.4663226902484894, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0302, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.707167387008667, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0319, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.5325813889503479, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0318, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.6239158511161804, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0289, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.38823947310447693, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0266, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.48849165439605713, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0234, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.23214028775691986, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0276, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3467197120189667, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0282, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.2009357064962387, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0298, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.8589951395988464, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0264, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.43969056010246277, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0292, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.5750611424446106, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0289, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.5399556756019592, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0307, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.20517395436763763, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0249, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.7490189671516418, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0246, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.6661257743835449, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0325, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.571394681930542, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0342, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.8792482018470764, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0332, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.5770248770713806, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0286, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.62962406873703, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0246, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.4651380479335785, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.037, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.5087499022483826, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0265, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.44421979784965515, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0306, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.6521517038345337, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0334, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.5384942889213562, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0296, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.41909387707710266, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0297, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.6697047352790833, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0331, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.4015032947063446, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0326, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.48070228099823, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0278, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.8651071786880493, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0242, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 1.17703378200531, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0288, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.45865103602409363, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0322, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.41243845224380493, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0297, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.482997864484787, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0305, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.5319142937660217, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0284, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.6116752028465271, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0311, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.4214901328086853, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0269, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.6246733069419861, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.026, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.4263368248939514, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0305, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.4059041738510132, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.022, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.6362516283988953, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0265, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.2905973494052887, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0297, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.42270833253860474, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0255, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.26410749554634094, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0252, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.7570974230766296, + "learning_rate": 1.153689339251154e-05, + "loss": 0.027, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.5941224098205566, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0295, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.3985750079154968, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0337, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.3877560496330261, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.024, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.44742006063461304, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0284, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.3280893564224243, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0318, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.5289477109909058, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0341, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.4976208806037903, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0239, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.6153465509414673, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0252, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.6112402677536011, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0292, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.4973732531070709, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0307, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.5871816277503967, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0254, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 1.2150986194610596, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.033, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.6406526565551758, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0265, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.4251798093318939, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0269, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.4702431857585907, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0311, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.3235304355621338, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0236, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.4913889467716217, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0231, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.4980977177619934, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0289, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.740922212600708, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0334, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.3305300772190094, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0301, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.7037357091903687, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0311, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.44783756136894226, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0339, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.7776843309402466, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0349, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.49181437492370605, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0285, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.333814799785614, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0284, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 1.203652262687683, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0365, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.521643877029419, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0313, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.33309581875801086, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0265, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.48567256331443787, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0357, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8473871946334839, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0355, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.43827518820762634, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0266, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.5849157571792603, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0317, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.5690399408340454, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0266, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.6484784483909607, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0294, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.8894811272621155, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0239, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4575272798538208, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0323, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.4288756847381592, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.032, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.8871303200721741, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0243, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.5861580967903137, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0335, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.4159319996833801, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0247, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.6948496699333191, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0299, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.5089551210403442, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0333, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.6912631392478943, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0303, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.6213784217834473, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0295, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.4634060561656952, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0261, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.5664045214653015, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0262, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.7963227033615112, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0278, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.45378491282463074, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0268, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.8970746994018555, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0271, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.5109472274780273, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0307, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.5023297667503357, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0263, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.6055631041526794, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0285, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.38602766394615173, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0282, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.5447302460670471, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0319, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.6613780856132507, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0271, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 1.0358555316925049, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.026, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.4463629722595215, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0271, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.5373798608779907, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.025, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.7735916972160339, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0325, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.5017692446708679, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0262, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.3406142592430115, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0271, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.28971537947654724, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0238, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.45441415905952454, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0261, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.4653581976890564, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.026, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.5449947714805603, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0314, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.41015395522117615, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0272, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.5936392545700073, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0269, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.5043690800666809, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0256, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.6176534295082092, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0285, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.6774734258651733, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0268, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.7045454978942871, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0305, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.5905448794364929, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0284, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.7881343364715576, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0321, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.6635507941246033, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0284, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.46298888325691223, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0394, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.5187172889709473, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0257, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.5974661707878113, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0305, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.5171123743057251, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0275, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.35988888144493103, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0295, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.30543047189712524, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0334, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.6582810878753662, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0309, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.4986134171485901, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0294, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.5560855269432068, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0224, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.28974607586860657, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0313, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.24015791714191437, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.026, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.2704199552536011, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0244, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.6661707162857056, + "learning_rate": 1.068904422762975e-05, + "loss": 0.027, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.5058556795120239, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0254, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.7086800336837769, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0242, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.6752822399139404, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0262, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.8279762268066406, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0312, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.5070614814758301, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0308, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.3933897614479065, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0287, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.37238794565200806, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0325, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.7591347098350525, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0265, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.4841652810573578, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0331, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.45236295461654663, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0412, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.4774094820022583, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0289, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.47564345598220825, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0294, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.341337651014328, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0281, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.341701865196228, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0224, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.6621959209442139, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0283, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.348466694355011, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0234, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.35208311676979065, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0248, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.4973156154155731, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0246, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.3668982982635498, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0228, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.4771873950958252, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0303, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.3595021665096283, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0265, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.6013099551200867, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0297, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.40996676683425903, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0321, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.45742037892341614, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0288, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.8092222213745117, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0278, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.32741186022758484, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0288, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.5716732740402222, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0256, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.3263239562511444, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0271, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.35390567779541016, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0266, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.36520150303840637, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0265, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.46227532625198364, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0305, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.40079647302627563, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0327, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.3689155578613281, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0249, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.49527907371520996, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.029, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.38931334018707275, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0233, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.5698918700218201, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0269, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 1.0959579944610596, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.029, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.6321646571159363, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0276, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.7166606783866882, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0292, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.6464444994926453, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0246, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.7318128347396851, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0296, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.4828032851219177, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0247, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.4509548842906952, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0241, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.413630872964859, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0313, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.42443349957466125, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0316, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.8199112415313721, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0389, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.28918105363845825, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0242, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.6759344339370728, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0308, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.5480250120162964, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.025, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.48897549510002136, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.027, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.6111220121383667, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0276, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.8852546215057373, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0251, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.5098162889480591, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.022, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.45974940061569214, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0206, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3925095200538635, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0251, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.5461363792419434, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0217, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.5685333609580994, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0231, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.494150310754776, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0243, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.8770614862442017, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0286, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.27142134308815, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0253, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.3365682363510132, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0241, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.5512370467185974, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0242, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.5581703782081604, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0276, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.306773841381073, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0262, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.44620928168296814, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0229, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.5870804786682129, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0228, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.26162099838256836, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0278, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.27250319719314575, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0293, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.8330137729644775, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0315, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.5206989645957947, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0282, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.5408382415771484, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0359, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.30517199635505676, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0267, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.5315027236938477, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0206, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.46061626076698303, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0222, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.47393080592155457, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0262, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.3686772882938385, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0254, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.3312757611274719, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0243, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.565447986125946, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0267, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.5690101385116577, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0237, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.44088438153266907, + "learning_rate": 9.911670744652783e-06, + "loss": 0.028, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.3708919882774353, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0265, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.589698851108551, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0297, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.6541375517845154, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0288, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.5304558873176575, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0243, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.5774737000465393, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0277, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.5616280436515808, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0267, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.6129759550094604, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0223, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.45278221368789673, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0304, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.44487202167510986, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0296, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.5391712188720703, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0256, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.43523359298706055, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0277, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.5308435559272766, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0242, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.3361283540725708, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0236, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.3764631450176239, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0304, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.9003425240516663, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0278, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.2787775993347168, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0219, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.40089285373687744, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0284, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.3619711101055145, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0252, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.7354542016983032, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0242, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.3854006826877594, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0302, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.3318389058113098, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0265, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.5286651849746704, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0235, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.24921932816505432, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0259, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.7376067042350769, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0238, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.35099226236343384, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0257, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.3805389702320099, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0198, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.4433703124523163, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0241, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.3667793571949005, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0268, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.2963331639766693, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0223, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.9817414879798889, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0248, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.6529688835144043, + "learning_rate": 9.612315882780393e-06, + "loss": 0.032, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.7663154602050781, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0267, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.6086964011192322, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0281, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.5240464806556702, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0339, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.6558368802070618, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0284, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.6192268133163452, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0309, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.5293763875961304, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0257, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.38831329345703125, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0239, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.12827467918396, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0323, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.411818265914917, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0274, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.5521355867385864, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0233, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.26673075556755066, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0317, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.5205486416816711, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0273, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.8010990619659424, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0292, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.420612633228302, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0274, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.4811270236968994, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0277, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.4959382712841034, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0288, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.4607725739479065, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0245, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.9101414680480957, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0283, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.38626620173454285, + "learning_rate": 9.42959233811777e-06, + "loss": 0.026, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.5709372758865356, + "learning_rate": 9.419993062475743e-06, + "loss": 0.021, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.4417913854122162, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0291, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.5651213526725769, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0228, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.4716165363788605, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0242, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.9120892286300659, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0296, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.5004292130470276, + "learning_rate": 9.372024722887089e-06, + "loss": 0.033, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.3422714173793793, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0284, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.5391610264778137, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0362, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.5446203351020813, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0247, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.5441875457763672, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0284, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.48274070024490356, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0245, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.6035326719284058, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0226, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.3104001581668854, + "learning_rate": 9.304949604077693e-06, + "loss": 0.029, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.27859869599342346, + "learning_rate": 9.295375311262483e-06, + "loss": 0.022, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.3896406292915344, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0235, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.4526473581790924, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0289, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.6624506115913391, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0265, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.6976125836372375, + "learning_rate": 9.257098257046206e-06, + "loss": 0.029, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.5974310040473938, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0205, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.7627739906311035, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0333, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.3166525065898895, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0309, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.41519322991371155, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0223, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.31840237975120544, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0239, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.47412827610969543, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0228, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.41170552372932434, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0209, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.45858854055404663, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0243, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.7870534658432007, + "learning_rate": 9.171095634265995e-06, + "loss": 0.027, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.4080354869365692, + "learning_rate": 9.161550369445782e-06, + "loss": 0.023, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.47916823625564575, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0303, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.6911760568618774, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0263, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.3980148732662201, + "learning_rate": 9.132927564918328e-06, + "loss": 0.028, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.47085851430892944, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0266, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.5085862874984741, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0239, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.5219245553016663, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0267, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.5199264287948608, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0277, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.6157195568084717, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0343, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.5366696715354919, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0271, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.3640076220035553, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0258, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.5320505499839783, + "learning_rate": 9.05669731553499e-06, + "loss": 0.024, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.507826566696167, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0253, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.741392195224762, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0242, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.5325136184692383, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0224, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.4709665775299072, + "learning_rate": 9.018636566864313e-06, + "loss": 0.026, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.4371986985206604, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0264, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.47594818472862244, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0224, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.488423228263855, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0261, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.24745763838291168, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0206, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.5042629837989807, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0305, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.5255836844444275, + "learning_rate": 8.961615424107555e-06, + "loss": 0.026, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.4605107307434082, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0274, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.3252561390399933, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0227, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.35779184103012085, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0296, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.2960403263568878, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0212, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.6344659328460693, + "learning_rate": 8.914163487132906e-06, + "loss": 0.026, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.4614463150501251, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0234, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4490053951740265, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0265, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.5291271209716797, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0326, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.5311887264251709, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0257, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.5647584199905396, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0295, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.3913862705230713, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0256, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.4476219415664673, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0248, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.7807655930519104, + "learning_rate": 8.83836825410936e-06, + "loss": 0.026, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.38984328508377075, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0247, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.5757346153259277, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0296, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.25636178255081177, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0222, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.45617344975471497, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0224, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.3066493272781372, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0237, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.26513972878456116, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0277, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.445230633020401, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0248, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.4914413392543793, + "learning_rate": 8.762735374981932e-06, + "loss": 0.022, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.41469570994377136, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0245, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.33235347270965576, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0229, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.4890037775039673, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0247, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.41330578923225403, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0285, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.6309427618980408, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0233, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.42090296745300293, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0254, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.5888519287109375, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0262, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.5488774180412292, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0262, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.48015111684799194, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0219, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.4484168291091919, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0276, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.4128018319606781, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0218, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.5151517987251282, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0242, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.6248350143432617, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0267, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.4116908013820648, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0242, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.6138579249382019, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0282, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.22843605279922485, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0284, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.49555841088294983, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0244, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.5752411484718323, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0275, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.5129706859588623, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0237, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.751230001449585, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0257, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.47749435901641846, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0277, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21702095866203308, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0255, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.30658838152885437, + "learning_rate": 8.54624657467318e-06, + "loss": 0.024, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.3589625954627991, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0215, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.5434426069259644, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0224, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.8732438683509827, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0289, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.34988290071487427, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0226, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.4021032154560089, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0248, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.4676196873188019, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0235, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.41646474599838257, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0235, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.5892519950866699, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0221, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.5757095217704773, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0258, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.4664652645587921, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0275, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.4674879014492035, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0285, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.7277936339378357, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0316, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.40373867750167847, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0213, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.8632686138153076, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0239, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.5620945692062378, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0259, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3430384695529938, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0287, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.46981969475746155, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0218, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.3494231700897217, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0238, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.514975368976593, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0205, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.6442168951034546, + "learning_rate": 8.359228888944986e-06, + "loss": 0.021, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.32178881764411926, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0219, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.48865941166877747, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0261, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.6131434440612793, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0269, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.4471806585788727, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0251, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.8255780935287476, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0229, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.843673586845398, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0278, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.4278610348701477, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0228, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.5036011338233948, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0291, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.5141382813453674, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0217, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.8976346850395203, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0248, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.5634751319885254, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0276, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.5327013731002808, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0279, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.2723959982395172, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0225, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.4455258846282959, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0222, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.3784103989601135, + "learning_rate": 8.219774325200873e-06, + "loss": 0.024, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.8102694749832153, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0231, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.5179240703582764, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0255, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.39830490946769714, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0264, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.32860279083251953, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0241, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.5459582209587097, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0193, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.3841477036476135, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0282, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.7849119305610657, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0319, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.4457703232765198, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0279, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.30464428663253784, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0184, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 1.0635287761688232, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0265, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.33294421434402466, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0235, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.5644985437393188, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0218, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.4975566565990448, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0261, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.7503839135169983, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0218, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.35363277792930603, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0198, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.43968406319618225, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0253, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.4553394615650177, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0266, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.45489153265953064, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0264, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.424696147441864, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0209, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.4819740653038025, + "learning_rate": 8.03498318084394e-06, + "loss": 0.022, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.564834475517273, + "learning_rate": 8.025779439806006e-06, + "loss": 0.024, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.7905157804489136, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0261, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.6985124349594116, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0315, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.42378291487693787, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0237, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.5980759263038635, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0217, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.45916232466697693, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0235, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.25486481189727783, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0231, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.4072360694408417, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0261, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3813820481300354, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0209, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.3040210008621216, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0225, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.30910906195640564, + "learning_rate": 7.933935782312965e-06, + "loss": 0.026, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.6573566794395447, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0262, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.30632153153419495, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0251, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.3277539610862732, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0233, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.49589917063713074, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0211, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.4149130880832672, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0203, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.7051926851272583, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0272, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.8553881049156189, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0236, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.5676615238189697, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0242, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.29548707604408264, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0236, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.36076608300209045, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0219, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.3657922148704529, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0227, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.27593615651130676, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0251, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.35554730892181396, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0259, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.45652297139167786, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0274, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.5757999420166016, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0222, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.5138059854507446, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0216, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.338874876499176, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0232, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.48215195536613464, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0226, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.30239933729171753, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0205, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.6099343299865723, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0219, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.6730902791023254, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0239, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.4575020968914032, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0204, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.2673267424106598, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0222, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.3593531548976898, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0225, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.5385488867759705, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0248, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.3900541663169861, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0277, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.6182276010513306, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0241, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.4897976815700531, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0229, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.5717247128486633, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0273, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.4837515950202942, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0219, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.31954509019851685, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0271, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.23005163669586182, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0204, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.500217616558075, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0229, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.47326523065567017, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0203, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.5074726939201355, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0249, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.6583673357963562, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0243, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.7585731744766235, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0264, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.3782348036766052, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0216, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.43963512778282166, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0201, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.6450467109680176, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0254, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.3420482575893402, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0224, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.3532888889312744, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0216, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.32494598627090454, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0196, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.2898419499397278, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0234, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.4379838705062866, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0233, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.5390518307685852, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0169, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.3786150813102722, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0203, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.3376149833202362, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0266, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.40810349583625793, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0241, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.24485738575458527, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0199, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.4670563340187073, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0184, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.374255508184433, + "learning_rate": 7.4623904967312e-06, + "loss": 0.018, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.4191536605358124, + "learning_rate": 7.453427567620127e-06, + "loss": 0.022, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3807078003883362, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0232, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.7537381649017334, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0202, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.36507129669189453, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0236, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.24461498856544495, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0221, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.351654589176178, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0236, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.35627686977386475, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0213, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.4586603343486786, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0304, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.4082098603248596, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0237, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.47707459330558777, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0247, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.4687316119670868, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0344, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.4660017788410187, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0214, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.4644101560115814, + "learning_rate": 7.346200065486093e-06, + "loss": 0.022, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.3139079213142395, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0234, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.36445188522338867, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0262, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.6457782983779907, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0261, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.4184044599533081, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0245, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.44356703758239746, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0215, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.5394402742385864, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0302, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 0.5960429906845093, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0234, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2850514352321625, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0243, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.45071718096733093, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0233, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.3157344162464142, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0254, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.45518410205841064, + "learning_rate": 7.248450164740439e-06, + "loss": 0.024, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.2323702722787857, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0226, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.6025039553642273, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0246, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.4983830749988556, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0199, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.3684524595737457, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0252, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.36924007534980774, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0277, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.3531496822834015, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0228, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.3995579779148102, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0193, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.4124946892261505, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0221, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.3897329866886139, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0221, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.45230787992477417, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0238, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.45878538489341736, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0244, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.4302407503128052, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0237, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.30422642827033997, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0173, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.49566513299942017, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0201, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.43262094259262085, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0227, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.8250450491905212, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0259, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.3265332281589508, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0205, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.2871774435043335, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0201, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.4341558814048767, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0199, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.43365293741226196, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0201, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.5876246690750122, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0205, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.2719171643257141, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0211, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.38791123032569885, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0244, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.4082484543323517, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0206, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.5010205507278442, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0245, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.4404369294643402, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0268, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.5171347856521606, + "learning_rate": 7.010805483338283e-06, + "loss": 0.024, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.5137951970100403, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0241, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.563709557056427, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0193, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.44687238335609436, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0207, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.33815798163414, + "learning_rate": 6.975884226362e-06, + "loss": 0.0246, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.33789384365081787, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0206, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.38053908944129944, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0195, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.5730066299438477, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0199, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.42453598976135254, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0218, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.48010921478271484, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0328, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.5227254629135132, + "learning_rate": 6.923644220932124e-06, + "loss": 0.019, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4078599810600281, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0212, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.4473094046115875, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0281, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.3459968864917755, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0231, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.4205886721611023, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0256, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.5397320985794067, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0214, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.6208626627922058, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0224, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.34377506375312805, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0197, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.4086950123310089, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0202, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.5211176872253418, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0201, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3705415725708008, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0219, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.32692769169807434, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0204, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.42599135637283325, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0213, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.565449595451355, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0223, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.4027825593948364, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0233, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.4833034574985504, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0309, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.5570312738418579, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0213, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.30241742730140686, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0197, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.37468239665031433, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0214, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.5555301904678345, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0223, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.6084730625152588, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0261, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.5931955575942993, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0237, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.30350545048713684, + "learning_rate": 6.733587654719298e-06, + "loss": 0.02, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.6784055233001709, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0281, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.5559973120689392, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0204, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.7529487013816833, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0235, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.7032052874565125, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0176, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.5018401741981506, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0197, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.5020368695259094, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0231, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.3605690598487854, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0254, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.3482762575149536, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0223, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.4260469675064087, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0199, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.23622000217437744, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0239, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.3683573007583618, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0223, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.32972025871276855, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0228, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.4159783124923706, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0221, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.24288412928581238, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0188, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.42375463247299194, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0183, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.26672226190567017, + "learning_rate": 6.596880604028027e-06, + "loss": 0.02, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.30816635489463806, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0219, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.315452516078949, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0218, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.5412175059318542, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0233, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.4290241003036499, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0233, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.3977762460708618, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0239, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.4023628532886505, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0197, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.8707197308540344, + "learning_rate": 6.53748481975927e-06, + "loss": 0.029, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.37878328561782837, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0218, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.685556173324585, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0248, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.5783588886260986, + "learning_rate": 6.512107839793337e-06, + "loss": 0.02, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.5456825494766235, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0279, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.6162738800048828, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0259, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.38887348771095276, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0198, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.5207514762878418, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0201, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.671120822429657, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0259, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.28870952129364014, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0175, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.3909374177455902, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0214, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.3419650197029114, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0217, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.563515305519104, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0185, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.6295453310012817, + "learning_rate": 6.427861749601945e-06, + "loss": 0.023, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.4404713213443756, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0188, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.698448121547699, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0225, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.5679222941398621, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0213, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.5237470269203186, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0261, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.4205586016178131, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0232, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.36608314514160156, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.02, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.49511757493019104, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0247, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.3475521206855774, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0202, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.36345914006233215, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0197, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.34304162859916687, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0183, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.41459065675735474, + "learning_rate": 6.335811156758245e-06, + "loss": 0.02, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.34139952063560486, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0211, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.29463231563568115, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0225, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.37984198331832886, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0201, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.21912901103496552, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0226, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.34660178422927856, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0179, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.6080809235572815, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0187, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.43388310074806213, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0226, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.53389972448349, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0237, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.39731428027153015, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0176, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.32715681195259094, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0211, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.36709150671958923, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0194, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.5554866790771484, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0202, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.26253199577331543, + "learning_rate": 6.227878992893104e-06, + "loss": 0.02, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.3686104714870453, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0191, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.36151114106178284, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0213, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.5019435882568359, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0203, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 1.1914043426513672, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0249, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.45042529702186584, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0244, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.3239169120788574, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0219, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.3253174424171448, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0226, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.6497724652290344, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0238, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.5800855159759521, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0211, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.29717954993247986, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0198, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.35056066513061523, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0219, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.28448906540870667, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0227, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.33300310373306274, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0165, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.5134487748146057, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0219, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.45153549313545227, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0191, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.6483689546585083, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0211, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.5660327076911926, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0207, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.6027820706367493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0201, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.6102983951568604, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0207, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.4383072257041931, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0275, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.42298370599746704, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0204, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.30508092045783997, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0195, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.6242369413375854, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0215, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.38399502635002136, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0201, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.4721924066543579, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0243, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.6958035230636597, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0201, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.3826717436313629, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0236, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.3098534941673279, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0216, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.43973061442375183, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0234, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.46570682525634766, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0226, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.46847036480903625, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0188, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.5139725804328918, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0195, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.48436662554740906, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0206, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.3445553481578827, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0241, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.8473356366157532, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0248, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.6241415143013, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0242, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.7302873730659485, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0224, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.29269692301750183, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0181, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.4065910577774048, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0253, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.36930134892463684, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0203, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.5521696209907532, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0208, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.3761119544506073, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0209, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.3330603241920471, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0233, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.27771884202957153, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0162, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.4225069284439087, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0177, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.33680275082588196, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0199, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.4399181604385376, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0236, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.49677175283432007, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0265, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.39700835943222046, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0193, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.4604041278362274, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0208, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.26002946496009827, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0197, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.3256632685661316, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0192, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3573099672794342, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0184, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.3116256892681122, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0197, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.39247608184814453, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0219, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.31291085481643677, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0194, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.5996116399765015, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0264, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.24854864180088043, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0207, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.5746667385101318, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0195, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.5744135975837708, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0182, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.5161272883415222, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0212, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.5889247059822083, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0172, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.53412926197052, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0209, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.3421672582626343, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0193, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.409906268119812, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0173, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.5139239430427551, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0198, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.5014253258705139, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0177, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.5942979454994202, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0206, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.218281552195549, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0204, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.43725427985191345, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0215, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.3467969000339508, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0168, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.2697127163410187, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0214, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.43687018752098083, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0262, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.47759339213371277, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0212, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.33211249113082886, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0228, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.29453045129776, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0233, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.34539318084716797, + "learning_rate": 5.608700869895367e-06, + "loss": 0.021, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.6664339900016785, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0203, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.21404555439949036, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0209, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.4320753812789917, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0236, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.415399968624115, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0235, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.2643829584121704, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0203, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.4354988932609558, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0172, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.43992263078689575, + "learning_rate": 5.554208267666996e-06, + "loss": 0.018, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.32208460569381714, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0183, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.27261701226234436, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0196, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.4348963499069214, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0173, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.40379852056503296, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0202, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.4592876136302948, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0219, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.4797484278678894, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0182, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.47892817854881287, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0185, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.46308979392051697, + "learning_rate": 5.492314644463202e-06, + "loss": 0.018, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.7745133638381958, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0207, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.6577957272529602, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0166, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.43036580085754395, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0218, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.41811347007751465, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0214, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.31980884075164795, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0198, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.3632652461528778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0209, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.467146635055542, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0173, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.5659807920455933, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0199, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.24540813267230988, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0178, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.3122001588344574, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0222, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2879388928413391, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0173, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.5185259580612183, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0168, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.239187091588974, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0198, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3844532370567322, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0179, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.3842040002346039, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0204, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.26496851444244385, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0172, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.40850451588630676, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0189, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21669425070285797, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0192, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.43664559721946716, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.021, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.49064821004867554, + "learning_rate": 5.339400468833427e-06, + "loss": 0.02, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.9060949683189392, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0204, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.3413904309272766, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0212, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.2620849311351776, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0201, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.3972470760345459, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0216, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.4422028064727783, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0177, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.2595955431461334, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0214, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.43522438406944275, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0226, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.33024686574935913, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0199, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.3532852232456207, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0194, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.3963644802570343, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0171, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.37003734707832336, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0174, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.27832016348838806, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0211, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.4203765392303467, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0196, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.31796127557754517, + "learning_rate": 5.233937303988081e-06, + "loss": 0.019, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.4561198949813843, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0198, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.4175209403038025, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0195, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.7017586827278137, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0201, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.4711352288722992, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.02, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.2737489640712738, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0198, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.44284430146217346, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0206, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.4556163251399994, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0208, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.3158712685108185, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0156, + "step": 22000 + }, + { + "epoch": 1.3188327640961113, + "grad_norm": 0.4620053172111511, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0187, + "step": 22010 + }, + { + "epoch": 1.3194319611720295, + "grad_norm": 0.7892107963562012, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0195, + "step": 22020 + }, + { + "epoch": 1.3200311582479478, + "grad_norm": 0.37334534525871277, + "learning_rate": 5.152002600477859e-06, + "loss": 0.02, + "step": 22030 + }, + { + "epoch": 1.320630355323866, + "grad_norm": 0.4440039098262787, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0244, + "step": 22040 + }, + { + "epoch": 1.3212295523997843, + "grad_norm": 0.2650533616542816, + "learning_rate": 5.137194259935739e-06, + "loss": 0.017, + "step": 22050 + }, + { + "epoch": 1.3218287494757026, + "grad_norm": 0.5425522327423096, + "learning_rate": 5.129800405815733e-06, + "loss": 0.019, + "step": 22060 + }, + { + "epoch": 1.3224279465516209, + "grad_norm": 0.5764152407646179, + "learning_rate": 5.122413440701921e-06, + "loss": 0.018, + "step": 22070 + }, + { + "epoch": 1.3230271436275391, + "grad_norm": 0.3985585868358612, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0214, + "step": 22080 + }, + { + "epoch": 1.3236263407034574, + "grad_norm": 0.513511598110199, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0189, + "step": 22090 + }, + { + "epoch": 1.3242255377793757, + "grad_norm": 0.3784070909023285, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0164, + "step": 22100 + }, + { + "epoch": 1.324824734855294, + "grad_norm": 0.7029585242271423, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0201, + "step": 22110 + }, + { + "epoch": 1.3254239319312122, + "grad_norm": 0.28351524472236633, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0207, + "step": 22120 + }, + { + "epoch": 1.3260231290071305, + "grad_norm": 0.5500089526176453, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0222, + "step": 22130 + }, + { + "epoch": 1.3266223260830488, + "grad_norm": 0.35926392674446106, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0195, + "step": 22140 + }, + { + "epoch": 1.327221523158967, + "grad_norm": 0.24845866858959198, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0198, + "step": 22150 + }, + { + "epoch": 1.3278207202348853, + "grad_norm": 0.3264683485031128, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0178, + "step": 22160 + }, + { + "epoch": 1.3284199173108036, + "grad_norm": 0.47955816984176636, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0206, + "step": 22170 + }, + { + "epoch": 1.3290191143867218, + "grad_norm": 0.31802570819854736, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0168, + "step": 22180 + }, + { + "epoch": 1.32961831146264, + "grad_norm": 0.40685755014419556, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0223, + "step": 22190 + }, + { + "epoch": 1.3302175085385584, + "grad_norm": 0.4924621284008026, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0195, + "step": 22200 + }, + { + "epoch": 1.3308167056144766, + "grad_norm": 0.640724241733551, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0183, + "step": 22210 + }, + { + "epoch": 1.331415902690395, + "grad_norm": 0.6712080836296082, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0196, + "step": 22220 + }, + { + "epoch": 1.3320150997663132, + "grad_norm": 0.34785783290863037, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0174, + "step": 22230 + }, + { + "epoch": 1.3326142968422314, + "grad_norm": 0.46851038932800293, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0186, + "step": 22240 + }, + { + "epoch": 1.3332134939181497, + "grad_norm": 0.6138949394226074, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0197, + "step": 22250 + }, + { + "epoch": 1.333812690994068, + "grad_norm": 0.3083338439464569, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0179, + "step": 22260 + }, + { + "epoch": 1.3344118880699862, + "grad_norm": 0.3143295347690582, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0217, + "step": 22270 + }, + { + "epoch": 1.3350110851459045, + "grad_norm": 0.3330692946910858, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0149, + "step": 22280 + }, + { + "epoch": 1.3356102822218228, + "grad_norm": 0.2732333242893219, + "learning_rate": 4.961660586405147e-06, + "loss": 0.017, + "step": 22290 + }, + { + "epoch": 1.336209479297741, + "grad_norm": 0.3350054621696472, + "learning_rate": 4.954434444590436e-06, + "loss": 0.022, + "step": 22300 + }, + { + "epoch": 1.3368086763736593, + "grad_norm": 0.2735322415828705, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0181, + "step": 22310 + }, + { + "epoch": 1.3374078734495776, + "grad_norm": 0.5919206738471985, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0201, + "step": 22320 + }, + { + "epoch": 1.3380070705254958, + "grad_norm": 0.28201058506965637, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0188, + "step": 22330 + }, + { + "epoch": 1.338606267601414, + "grad_norm": 0.505592942237854, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0188, + "step": 22340 + }, + { + "epoch": 1.3392054646773324, + "grad_norm": 0.5231548547744751, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0184, + "step": 22350 + }, + { + "epoch": 1.3398046617532506, + "grad_norm": 0.3743092715740204, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0176, + "step": 22360 + }, + { + "epoch": 1.340403858829169, + "grad_norm": 0.5908241271972656, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0224, + "step": 22370 + }, + { + "epoch": 1.3410030559050872, + "grad_norm": 0.4231952428817749, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0177, + "step": 22380 + }, + { + "epoch": 1.3416022529810054, + "grad_norm": 0.5666583180427551, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0218, + "step": 22390 + }, + { + "epoch": 1.3422014500569237, + "grad_norm": 0.4740161597728729, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0179, + "step": 22400 + }, + { + "epoch": 1.342800647132842, + "grad_norm": 0.3947773873806, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.02, + "step": 22410 + }, + { + "epoch": 1.3433998442087602, + "grad_norm": 0.3114109933376312, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0223, + "step": 22420 + }, + { + "epoch": 1.3439990412846785, + "grad_norm": 0.44969403743743896, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0169, + "step": 22430 + }, + { + "epoch": 1.3445982383605968, + "grad_norm": 0.29602059721946716, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0168, + "step": 22440 + }, + { + "epoch": 1.345197435436515, + "grad_norm": 0.3884619474411011, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0205, + "step": 22450 + }, + { + "epoch": 1.3457966325124333, + "grad_norm": 0.2929127514362335, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0149, + "step": 22460 + }, + { + "epoch": 1.3463958295883516, + "grad_norm": 0.4955149292945862, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0213, + "step": 22470 + }, + { + "epoch": 1.3469950266642698, + "grad_norm": 0.4021163582801819, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0192, + "step": 22480 + }, + { + "epoch": 1.3475942237401881, + "grad_norm": 0.2945493757724762, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.02, + "step": 22490 + }, + { + "epoch": 1.3481934208161064, + "grad_norm": 0.34085726737976074, + "learning_rate": 4.81141273556404e-06, + "loss": 0.0286, + "step": 22500 + }, + { + "epoch": 1.3487926178920246, + "grad_norm": 0.32751014828681946, + "learning_rate": 4.804337352679613e-06, + "loss": 0.0226, + "step": 22510 + }, + { + "epoch": 1.349391814967943, + "grad_norm": 0.3844929337501526, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0155, + "step": 22520 + }, + { + "epoch": 1.3499910120438612, + "grad_norm": 0.5286590456962585, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0229, + "step": 22530 + }, + { + "epoch": 1.3505902091197795, + "grad_norm": 0.26664429903030396, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0151, + "step": 22540 + }, + { + "epoch": 1.3511894061956977, + "grad_norm": 0.528367280960083, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0239, + "step": 22550 + }, + { + "epoch": 1.351788603271616, + "grad_norm": 0.5871155858039856, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0196, + "step": 22560 + }, + { + "epoch": 1.3523878003475343, + "grad_norm": 0.5686034560203552, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0184, + "step": 22570 + }, + { + "epoch": 1.3529869974234525, + "grad_norm": 0.40526366233825684, + "learning_rate": 4.755013723146175e-06, + "loss": 0.018, + "step": 22580 + }, + { + "epoch": 1.3535861944993708, + "grad_norm": 0.37055784463882446, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0184, + "step": 22590 + }, + { + "epoch": 1.354185391575289, + "grad_norm": 0.5210561156272888, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0165, + "step": 22600 + }, + { + "epoch": 1.3547845886512073, + "grad_norm": 0.3386324942111969, + "learning_rate": 4.733984792194363e-06, + "loss": 0.018, + "step": 22610 + }, + { + "epoch": 1.3553837857271256, + "grad_norm": 0.40071168541908264, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0198, + "step": 22620 + }, + { + "epoch": 1.3559829828030439, + "grad_norm": 0.3415983319282532, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0168, + "step": 22630 + }, + { + "epoch": 1.3565821798789621, + "grad_norm": 0.3700709939002991, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0166, + "step": 22640 + }, + { + "epoch": 1.3571813769548804, + "grad_norm": 0.3559338450431824, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0174, + "step": 22650 + }, + { + "epoch": 1.3577805740307987, + "grad_norm": 0.5588265657424927, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0207, + "step": 22660 + }, + { + "epoch": 1.358379771106717, + "grad_norm": 0.4539838433265686, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0164, + "step": 22670 + }, + { + "epoch": 1.3589789681826352, + "grad_norm": 0.34879690408706665, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0165, + "step": 22680 + }, + { + "epoch": 1.3595781652585535, + "grad_norm": 0.22862373292446136, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0158, + "step": 22690 + }, + { + "epoch": 1.3601773623344717, + "grad_norm": 0.5536275506019592, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0137, + "step": 22700 + }, + { + "epoch": 1.36077655941039, + "grad_norm": 0.5599532127380371, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0206, + "step": 22710 + }, + { + "epoch": 1.3613757564863083, + "grad_norm": 0.2961312532424927, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0138, + "step": 22720 + }, + { + "epoch": 1.3619749535622265, + "grad_norm": 0.5834526419639587, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0174, + "step": 22730 + }, + { + "epoch": 1.362574150638145, + "grad_norm": 0.5941792726516724, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0205, + "step": 22740 + }, + { + "epoch": 1.363173347714063, + "grad_norm": 0.2580801844596863, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0199, + "step": 22750 + }, + { + "epoch": 1.3637725447899816, + "grad_norm": 0.3897567689418793, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0168, + "step": 22760 + }, + { + "epoch": 1.3643717418658996, + "grad_norm": 0.37937042117118835, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0213, + "step": 22770 + }, + { + "epoch": 1.364970938941818, + "grad_norm": 0.3964179456233978, + "learning_rate": 4.616077433849538e-06, + "loss": 0.019, + "step": 22780 + }, + { + "epoch": 1.3655701360177361, + "grad_norm": 0.3632303476333618, + "learning_rate": 4.609208744970524e-06, + "loss": 0.015, + "step": 22790 + }, + { + "epoch": 1.3661693330936546, + "grad_norm": 0.5750122666358948, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0168, + "step": 22800 + }, + { + "epoch": 1.3667685301695727, + "grad_norm": 0.36310067772865295, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0172, + "step": 22810 + }, + { + "epoch": 1.3673677272454912, + "grad_norm": 0.5438339114189148, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0198, + "step": 22820 + }, + { + "epoch": 1.3679669243214092, + "grad_norm": 0.37394630908966064, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0202, + "step": 22830 + }, + { + "epoch": 1.3685661213973277, + "grad_norm": 0.2454962432384491, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0188, + "step": 22840 + }, + { + "epoch": 1.3691653184732457, + "grad_norm": 0.474844366312027, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0223, + "step": 22850 + }, + { + "epoch": 1.3697645155491642, + "grad_norm": 0.30256277322769165, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0188, + "step": 22860 + }, + { + "epoch": 1.3703637126250823, + "grad_norm": 0.500045657157898, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0179, + "step": 22870 + }, + { + "epoch": 1.3709629097010008, + "grad_norm": 0.609107494354248, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0182, + "step": 22880 + }, + { + "epoch": 1.3715621067769188, + "grad_norm": 0.20867787301540375, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0168, + "step": 22890 + }, + { + "epoch": 1.3721613038528373, + "grad_norm": 0.41653770208358765, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0172, + "step": 22900 + }, + { + "epoch": 1.3727605009287553, + "grad_norm": 0.357435941696167, + "learning_rate": 4.527371771040039e-06, + "loss": 0.017, + "step": 22910 + }, + { + "epoch": 1.3733596980046738, + "grad_norm": 0.5994096994400024, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0177, + "step": 22920 + }, + { + "epoch": 1.3739588950805919, + "grad_norm": 0.3150171935558319, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0164, + "step": 22930 + }, + { + "epoch": 1.3745580921565104, + "grad_norm": 0.4483601748943329, + "learning_rate": 4.507082898761475e-06, + "loss": 0.019, + "step": 22940 + }, + { + "epoch": 1.3751572892324284, + "grad_norm": 0.529812753200531, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0175, + "step": 22950 + }, + { + "epoch": 1.375756486308347, + "grad_norm": 0.26758334040641785, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0187, + "step": 22960 + }, + { + "epoch": 1.376355683384265, + "grad_norm": 0.3228643834590912, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0195, + "step": 22970 + }, + { + "epoch": 1.3769548804601834, + "grad_norm": 0.3437839150428772, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0207, + "step": 22980 + }, + { + "epoch": 1.3775540775361017, + "grad_norm": 0.28592896461486816, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0185, + "step": 22990 + }, + { + "epoch": 1.37815327461202, + "grad_norm": 0.5544041991233826, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0191, + "step": 23000 + }, + { + "epoch": 1.3787524716879382, + "grad_norm": 1.0831762552261353, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0237, + "step": 23010 + }, + { + "epoch": 1.3793516687638565, + "grad_norm": 0.3546636700630188, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0203, + "step": 23020 + }, + { + "epoch": 1.3799508658397748, + "grad_norm": 0.32998642325401306, + "learning_rate": 4.446628604336844e-06, + "loss": 0.018, + "step": 23030 + }, + { + "epoch": 1.380550062915693, + "grad_norm": 0.40987834334373474, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0189, + "step": 23040 + }, + { + "epoch": 1.3811492599916113, + "grad_norm": 0.6094655990600586, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0174, + "step": 23050 + }, + { + "epoch": 1.3817484570675296, + "grad_norm": 0.631481409072876, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0179, + "step": 23060 + }, + { + "epoch": 1.3823476541434478, + "grad_norm": 0.4069002866744995, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0192, + "step": 23070 + }, + { + "epoch": 1.3829468512193661, + "grad_norm": 0.36600202322006226, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0196, + "step": 23080 + }, + { + "epoch": 1.3835460482952844, + "grad_norm": 0.3092246353626251, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0185, + "step": 23090 + }, + { + "epoch": 1.3841452453712026, + "grad_norm": 0.2811580300331116, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0162, + "step": 23100 + }, + { + "epoch": 1.384744442447121, + "grad_norm": 0.4177345037460327, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0196, + "step": 23110 + }, + { + "epoch": 1.3853436395230392, + "grad_norm": 0.40211164951324463, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0199, + "step": 23120 + }, + { + "epoch": 1.3859428365989575, + "grad_norm": 0.31014713644981384, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0173, + "step": 23130 + }, + { + "epoch": 1.3865420336748757, + "grad_norm": 0.5378808379173279, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0189, + "step": 23140 + }, + { + "epoch": 1.387141230750794, + "grad_norm": 0.3483606278896332, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0201, + "step": 23150 + }, + { + "epoch": 1.3877404278267123, + "grad_norm": 0.5112893581390381, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0209, + "step": 23160 + }, + { + "epoch": 1.3883396249026305, + "grad_norm": 0.26471400260925293, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.013, + "step": 23170 + }, + { + "epoch": 1.3889388219785488, + "grad_norm": 0.6770564317703247, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0174, + "step": 23180 + }, + { + "epoch": 1.389538019054467, + "grad_norm": 0.4251134693622589, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0169, + "step": 23190 + }, + { + "epoch": 1.3901372161303853, + "grad_norm": 0.2985415458679199, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0212, + "step": 23200 + }, + { + "epoch": 1.3907364132063036, + "grad_norm": 0.4635870158672333, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0164, + "step": 23210 + }, + { + "epoch": 1.3913356102822219, + "grad_norm": 0.4360525906085968, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0174, + "step": 23220 + }, + { + "epoch": 1.3919348073581401, + "grad_norm": 0.6121042370796204, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0176, + "step": 23230 + }, + { + "epoch": 1.3925340044340584, + "grad_norm": 0.3049333095550537, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0162, + "step": 23240 + }, + { + "epoch": 1.3931332015099767, + "grad_norm": 0.46471482515335083, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0193, + "step": 23250 + }, + { + "epoch": 1.393732398585895, + "grad_norm": 0.27093327045440674, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0204, + "step": 23260 + }, + { + "epoch": 1.3943315956618132, + "grad_norm": 0.3513331711292267, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0209, + "step": 23270 + }, + { + "epoch": 1.3949307927377315, + "grad_norm": 0.3452320396900177, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0172, + "step": 23280 + }, + { + "epoch": 1.3955299898136497, + "grad_norm": 0.44609951972961426, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0198, + "step": 23290 + }, + { + "epoch": 1.396129186889568, + "grad_norm": 0.27217286825180054, + "learning_rate": 4.269026084410863e-06, + "loss": 0.016, + "step": 23300 + }, + { + "epoch": 1.3967283839654863, + "grad_norm": 0.5857428908348083, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0206, + "step": 23310 + }, + { + "epoch": 1.3973275810414045, + "grad_norm": 0.3834620714187622, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0165, + "step": 23320 + }, + { + "epoch": 1.3979267781173228, + "grad_norm": 0.34176892042160034, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0156, + "step": 23330 + }, + { + "epoch": 1.398525975193241, + "grad_norm": 0.2497260719537735, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0183, + "step": 23340 + }, + { + "epoch": 1.3991251722691593, + "grad_norm": 0.3003418743610382, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0188, + "step": 23350 + }, + { + "epoch": 1.3997243693450776, + "grad_norm": 0.19922316074371338, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0162, + "step": 23360 + }, + { + "epoch": 1.4003235664209959, + "grad_norm": 0.5160003900527954, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0181, + "step": 23370 + }, + { + "epoch": 1.4009227634969141, + "grad_norm": 0.4917953312397003, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0197, + "step": 23380 + }, + { + "epoch": 1.4015219605728324, + "grad_norm": 0.2868032455444336, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0192, + "step": 23390 + }, + { + "epoch": 1.4021211576487507, + "grad_norm": 0.30980560183525085, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0178, + "step": 23400 + }, + { + "epoch": 1.402720354724669, + "grad_norm": 0.31523144245147705, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0193, + "step": 23410 + }, + { + "epoch": 1.4033195518005872, + "grad_norm": 0.23731909692287445, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0171, + "step": 23420 + }, + { + "epoch": 1.4039187488765055, + "grad_norm": 0.4911767542362213, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0171, + "step": 23430 + }, + { + "epoch": 1.4045179459524237, + "grad_norm": 0.3095512390136719, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0165, + "step": 23440 + }, + { + "epoch": 1.405117143028342, + "grad_norm": 0.6421821117401123, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0178, + "step": 23450 + }, + { + "epoch": 1.4057163401042603, + "grad_norm": 0.4887765645980835, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0212, + "step": 23460 + }, + { + "epoch": 1.4063155371801785, + "grad_norm": 0.4543951451778412, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0165, + "step": 23470 + }, + { + "epoch": 1.4069147342560968, + "grad_norm": 0.4595223367214203, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0144, + "step": 23480 + }, + { + "epoch": 1.407513931332015, + "grad_norm": 0.6325511336326599, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0203, + "step": 23490 + }, + { + "epoch": 1.4081131284079333, + "grad_norm": 0.6220779418945312, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0225, + "step": 23500 + }, + { + "epoch": 1.4087123254838516, + "grad_norm": 0.3728989362716675, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0202, + "step": 23510 + }, + { + "epoch": 1.4093115225597699, + "grad_norm": 0.4958861470222473, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0204, + "step": 23520 + }, + { + "epoch": 1.4099107196356881, + "grad_norm": 0.32445529103279114, + "learning_rate": 4.122270968037107e-06, + "loss": 0.016, + "step": 23530 + }, + { + "epoch": 1.4105099167116064, + "grad_norm": 0.3969140350818634, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0174, + "step": 23540 + }, + { + "epoch": 1.4111091137875247, + "grad_norm": 0.39698946475982666, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0163, + "step": 23550 + }, + { + "epoch": 1.411708310863443, + "grad_norm": 0.4633882939815521, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0179, + "step": 23560 + }, + { + "epoch": 1.4123075079393612, + "grad_norm": 0.36993899941444397, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0216, + "step": 23570 + }, + { + "epoch": 1.4129067050152795, + "grad_norm": 0.4137882590293884, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0187, + "step": 23580 + }, + { + "epoch": 1.4135059020911978, + "grad_norm": 0.320867121219635, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0238, + "step": 23590 + }, + { + "epoch": 1.414105099167116, + "grad_norm": 0.3139745593070984, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0175, + "step": 23600 + }, + { + "epoch": 1.4147042962430343, + "grad_norm": 0.572628378868103, + "learning_rate": 4.072221948222934e-06, + "loss": 0.018, + "step": 23610 + }, + { + "epoch": 1.4153034933189526, + "grad_norm": 0.575975239276886, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0189, + "step": 23620 + }, + { + "epoch": 1.4159026903948708, + "grad_norm": 0.26301854848861694, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0121, + "step": 23630 + }, + { + "epoch": 1.416501887470789, + "grad_norm": 0.3042408525943756, + "learning_rate": 4.053587511509546e-06, + "loss": 0.0185, + "step": 23640 + }, + { + "epoch": 1.4171010845467074, + "grad_norm": 0.2503415644168854, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0208, + "step": 23650 + }, + { + "epoch": 1.4177002816226256, + "grad_norm": 0.3556166887283325, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0202, + "step": 23660 + }, + { + "epoch": 1.418299478698544, + "grad_norm": 0.652975857257843, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0194, + "step": 23670 + }, + { + "epoch": 1.4188986757744622, + "grad_norm": 0.4215060770511627, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0166, + "step": 23680 + }, + { + "epoch": 1.4194978728503804, + "grad_norm": 0.2277296483516693, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0172, + "step": 23690 + }, + { + "epoch": 1.4200970699262987, + "grad_norm": 0.3370293378829956, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0201, + "step": 23700 + }, + { + "epoch": 1.420696267002217, + "grad_norm": 0.4235946834087372, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0189, + "step": 23710 + }, + { + "epoch": 1.4212954640781352, + "grad_norm": 1.0387974977493286, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0176, + "step": 23720 + }, + { + "epoch": 1.4218946611540535, + "grad_norm": 0.7258256077766418, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0204, + "step": 23730 + }, + { + "epoch": 1.4224938582299718, + "grad_norm": 0.35412806272506714, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0165, + "step": 23740 + }, + { + "epoch": 1.42309305530589, + "grad_norm": 0.5192556977272034, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0166, + "step": 23750 + }, + { + "epoch": 1.4236922523818083, + "grad_norm": 0.3292843699455261, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0163, + "step": 23760 + }, + { + "epoch": 1.4242914494577266, + "grad_norm": 0.46782153844833374, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0174, + "step": 23770 + }, + { + "epoch": 1.4248906465336448, + "grad_norm": 0.6324945092201233, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0183, + "step": 23780 + }, + { + "epoch": 1.4254898436095633, + "grad_norm": 0.4347882568836212, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0138, + "step": 23790 + }, + { + "epoch": 1.4260890406854814, + "grad_norm": 0.3393082320690155, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0155, + "step": 23800 + }, + { + "epoch": 1.4266882377613999, + "grad_norm": 0.28411221504211426, + "learning_rate": 3.949383948670156e-06, + "loss": 0.016, + "step": 23810 + }, + { + "epoch": 1.427287434837318, + "grad_norm": 0.45982369780540466, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0134, + "step": 23820 + }, + { + "epoch": 1.4278866319132364, + "grad_norm": 0.32810381054878235, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0163, + "step": 23830 + }, + { + "epoch": 1.4284858289891544, + "grad_norm": 0.5996097922325134, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0246, + "step": 23840 + }, + { + "epoch": 1.429085026065073, + "grad_norm": 0.40002167224884033, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0158, + "step": 23850 + }, + { + "epoch": 1.429684223140991, + "grad_norm": 0.4102090299129486, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0179, + "step": 23860 + }, + { + "epoch": 1.4302834202169095, + "grad_norm": 0.44915929436683655, + "learning_rate": 3.913175335139808e-06, + "loss": 0.019, + "step": 23870 + }, + { + "epoch": 1.4308826172928275, + "grad_norm": 0.251206636428833, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0183, + "step": 23880 + }, + { + "epoch": 1.431481814368746, + "grad_norm": 0.2564012408256531, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0182, + "step": 23890 + }, + { + "epoch": 1.432081011444664, + "grad_norm": 0.431265652179718, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0177, + "step": 23900 + }, + { + "epoch": 1.4326802085205825, + "grad_norm": 0.42389997839927673, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0146, + "step": 23910 + }, + { + "epoch": 1.4332794055965006, + "grad_norm": 0.9380725622177124, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0206, + "step": 23920 + }, + { + "epoch": 1.433878602672419, + "grad_norm": 0.3655669093132019, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0151, + "step": 23930 + }, + { + "epoch": 1.4344777997483371, + "grad_norm": 0.3248157501220703, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0152, + "step": 23940 + }, + { + "epoch": 1.4350769968242556, + "grad_norm": 0.5733596086502075, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0175, + "step": 23950 + }, + { + "epoch": 1.4356761939001736, + "grad_norm": 0.4672720730304718, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0185, + "step": 23960 + }, + { + "epoch": 1.4362753909760921, + "grad_norm": 0.22989575564861298, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0165, + "step": 23970 + }, + { + "epoch": 1.4368745880520102, + "grad_norm": 1.0956321954727173, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0181, + "step": 23980 + }, + { + "epoch": 1.4374737851279287, + "grad_norm": 0.39079031348228455, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0212, + "step": 23990 + }, + { + "epoch": 1.4380729822038467, + "grad_norm": 0.3974068760871887, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0197, + "step": 24000 + }, + { + "epoch": 1.4386721792797652, + "grad_norm": 1.1926871538162231, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0185, + "step": 24010 + }, + { + "epoch": 1.4392713763556833, + "grad_norm": 0.40923064947128296, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0171, + "step": 24020 + }, + { + "epoch": 1.4398705734316017, + "grad_norm": 0.38384920358657837, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0161, + "step": 24030 + }, + { + "epoch": 1.4404697705075198, + "grad_norm": 0.21791735291481018, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0168, + "step": 24040 + }, + { + "epoch": 1.4410689675834383, + "grad_norm": 0.3207184672355652, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0147, + "step": 24050 + }, + { + "epoch": 1.4416681646593565, + "grad_norm": 0.4831724166870117, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0182, + "step": 24060 + }, + { + "epoch": 1.4422673617352748, + "grad_norm": 0.47996360063552856, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0175, + "step": 24070 + }, + { + "epoch": 1.442866558811193, + "grad_norm": 0.41330286860466003, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0172, + "step": 24080 + }, + { + "epoch": 1.4434657558871113, + "grad_norm": 0.5012956857681274, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0234, + "step": 24090 + }, + { + "epoch": 1.4440649529630296, + "grad_norm": 0.4715912640094757, + "learning_rate": 3.777162510056721e-06, + "loss": 0.016, + "step": 24100 + }, + { + "epoch": 1.4446641500389479, + "grad_norm": 0.3817141652107239, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0158, + "step": 24110 + }, + { + "epoch": 1.4452633471148661, + "grad_norm": 0.3964484930038452, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0216, + "step": 24120 + }, + { + "epoch": 1.4458625441907844, + "grad_norm": 0.29786166548728943, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0197, + "step": 24130 + }, + { + "epoch": 1.4464617412667027, + "grad_norm": 0.2796359360218048, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.018, + "step": 24140 + }, + { + "epoch": 1.447060938342621, + "grad_norm": 0.30957916378974915, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0154, + "step": 24150 + }, + { + "epoch": 1.4476601354185392, + "grad_norm": 0.3837800920009613, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0171, + "step": 24160 + }, + { + "epoch": 1.4482593324944575, + "grad_norm": 0.29726749658584595, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0167, + "step": 24170 + }, + { + "epoch": 1.4488585295703758, + "grad_norm": 0.4624067544937134, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0187, + "step": 24180 + }, + { + "epoch": 1.449457726646294, + "grad_norm": 0.46996721625328064, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0156, + "step": 24190 + }, + { + "epoch": 1.4500569237222123, + "grad_norm": 0.351532518863678, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0221, + "step": 24200 + }, + { + "epoch": 1.4506561207981306, + "grad_norm": 0.5119938254356384, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0194, + "step": 24210 + }, + { + "epoch": 1.4512553178740488, + "grad_norm": 0.5102914571762085, + "learning_rate": 3.707974016467e-06, + "loss": 0.0152, + "step": 24220 + }, + { + "epoch": 1.451854514949967, + "grad_norm": 0.4638414680957794, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0167, + "step": 24230 + }, + { + "epoch": 1.4524537120258854, + "grad_norm": 0.6181433200836182, + "learning_rate": 3.696562092850226e-06, + "loss": 0.016, + "step": 24240 + }, + { + "epoch": 1.4530529091018036, + "grad_norm": 0.31810933351516724, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0175, + "step": 24250 + }, + { + "epoch": 1.453652106177722, + "grad_norm": 0.20725348591804504, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0139, + "step": 24260 + }, + { + "epoch": 1.4542513032536402, + "grad_norm": 0.29788675904273987, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0153, + "step": 24270 + }, + { + "epoch": 1.4548505003295584, + "grad_norm": 0.286422997713089, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0171, + "step": 24280 + }, + { + "epoch": 1.4554496974054767, + "grad_norm": 0.31199127435684204, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0183, + "step": 24290 + }, + { + "epoch": 1.456048894481395, + "grad_norm": 0.5850293040275574, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0199, + "step": 24300 + }, + { + "epoch": 1.4566480915573132, + "grad_norm": 0.5558650493621826, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0153, + "step": 24310 + }, + { + "epoch": 1.4572472886332315, + "grad_norm": 0.5221429467201233, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0159, + "step": 24320 + }, + { + "epoch": 1.4578464857091498, + "grad_norm": 0.40443119406700134, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0176, + "step": 24330 + }, + { + "epoch": 1.458445682785068, + "grad_norm": 0.4657982289791107, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0183, + "step": 24340 + }, + { + "epoch": 1.4590448798609863, + "grad_norm": 0.23784635961055756, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0156, + "step": 24350 + }, + { + "epoch": 1.4596440769369046, + "grad_norm": 0.3992721438407898, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0223, + "step": 24360 + }, + { + "epoch": 1.4602432740128228, + "grad_norm": 0.3949171304702759, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.014, + "step": 24370 + }, + { + "epoch": 1.460842471088741, + "grad_norm": 0.33738628029823303, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0157, + "step": 24380 + }, + { + "epoch": 1.4614416681646594, + "grad_norm": 0.42644673585891724, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0153, + "step": 24390 + }, + { + "epoch": 1.4620408652405776, + "grad_norm": 0.25812193751335144, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0173, + "step": 24400 + }, + { + "epoch": 1.462640062316496, + "grad_norm": 0.29154765605926514, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0162, + "step": 24410 + }, + { + "epoch": 1.4632392593924142, + "grad_norm": 0.3526030480861664, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0182, + "step": 24420 + }, + { + "epoch": 1.4638384564683324, + "grad_norm": 0.731890857219696, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0166, + "step": 24430 + }, + { + "epoch": 1.4644376535442507, + "grad_norm": 0.34727898240089417, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0146, + "step": 24440 + }, + { + "epoch": 1.465036850620169, + "grad_norm": 0.4517475962638855, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0229, + "step": 24450 + }, + { + "epoch": 1.4656360476960872, + "grad_norm": 0.3026634156703949, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0174, + "step": 24460 + }, + { + "epoch": 1.4662352447720055, + "grad_norm": 0.20546412467956543, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0177, + "step": 24470 + }, + { + "epoch": 1.4668344418479238, + "grad_norm": 0.47296327352523804, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0176, + "step": 24480 + }, + { + "epoch": 1.467433638923842, + "grad_norm": 0.4550913870334625, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0167, + "step": 24490 + }, + { + "epoch": 1.4680328359997603, + "grad_norm": 0.38641592860221863, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0144, + "step": 24500 + }, + { + "epoch": 1.4686320330756786, + "grad_norm": 0.23746857047080994, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0167, + "step": 24510 + }, + { + "epoch": 1.4692312301515968, + "grad_norm": 0.2114812433719635, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0167, + "step": 24520 + }, + { + "epoch": 1.4698304272275151, + "grad_norm": 0.41703343391418457, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.014, + "step": 24530 + }, + { + "epoch": 1.4704296243034334, + "grad_norm": 0.3279412090778351, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0158, + "step": 24540 + }, + { + "epoch": 1.4710288213793516, + "grad_norm": 0.41653862595558167, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0191, + "step": 24550 + }, + { + "epoch": 1.47162801845527, + "grad_norm": 0.5392111539840698, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0151, + "step": 24560 + }, + { + "epoch": 1.4722272155311882, + "grad_norm": 0.4654570519924164, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0179, + "step": 24570 + }, + { + "epoch": 1.4728264126071064, + "grad_norm": 0.5389031171798706, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0139, + "step": 24580 + }, + { + "epoch": 1.4734256096830247, + "grad_norm": 0.38597020506858826, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0142, + "step": 24590 + }, + { + "epoch": 1.474024806758943, + "grad_norm": 0.4820668399333954, + "learning_rate": 3.497061149826966e-06, + "loss": 0.015, + "step": 24600 + }, + { + "epoch": 1.4746240038348613, + "grad_norm": 0.36856982111930847, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0141, + "step": 24610 + }, + { + "epoch": 1.4752232009107795, + "grad_norm": 0.39727091789245605, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0173, + "step": 24620 + }, + { + "epoch": 1.4758223979866978, + "grad_norm": 0.29800575971603394, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.017, + "step": 24630 + }, + { + "epoch": 1.476421595062616, + "grad_norm": 0.6900123357772827, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0172, + "step": 24640 + }, + { + "epoch": 1.4770207921385343, + "grad_norm": 0.2665303647518158, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0157, + "step": 24650 + }, + { + "epoch": 1.4776199892144526, + "grad_norm": 0.3223106265068054, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.016, + "step": 24660 + }, + { + "epoch": 1.4782191862903709, + "grad_norm": 0.3684261739253998, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.02, + "step": 24670 + }, + { + "epoch": 1.4788183833662891, + "grad_norm": 0.38197198510169983, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0169, + "step": 24680 + }, + { + "epoch": 1.4794175804422074, + "grad_norm": 0.35841095447540283, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0175, + "step": 24690 + }, + { + "epoch": 1.4800167775181257, + "grad_norm": 0.4376572370529175, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0156, + "step": 24700 + }, + { + "epoch": 1.480615974594044, + "grad_norm": 0.5526829361915588, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0147, + "step": 24710 + }, + { + "epoch": 1.4812151716699622, + "grad_norm": 0.2922399938106537, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0152, + "step": 24720 + }, + { + "epoch": 1.4818143687458805, + "grad_norm": 0.4333120882511139, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0147, + "step": 24730 + }, + { + "epoch": 1.4824135658217987, + "grad_norm": 0.26118189096450806, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0166, + "step": 24740 + }, + { + "epoch": 1.483012762897717, + "grad_norm": 0.35313257575035095, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.016, + "step": 24750 + }, + { + "epoch": 1.4836119599736353, + "grad_norm": 0.29923367500305176, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0201, + "step": 24760 + }, + { + "epoch": 1.4842111570495535, + "grad_norm": 0.434772253036499, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0193, + "step": 24770 + }, + { + "epoch": 1.4848103541254718, + "grad_norm": 0.3422386646270752, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0148, + "step": 24780 + }, + { + "epoch": 1.48540955120139, + "grad_norm": 0.4303880035877228, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0197, + "step": 24790 + }, + { + "epoch": 1.4860087482773083, + "grad_norm": 0.4511156976222992, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0172, + "step": 24800 + }, + { + "epoch": 1.4866079453532266, + "grad_norm": 0.22014041244983673, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0146, + "step": 24810 + }, + { + "epoch": 1.4872071424291449, + "grad_norm": 0.4387083351612091, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0173, + "step": 24820 + }, + { + "epoch": 1.4878063395050631, + "grad_norm": 0.44642165303230286, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0175, + "step": 24830 + }, + { + "epoch": 1.4884055365809814, + "grad_norm": 0.39087313413619995, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0158, + "step": 24840 + }, + { + "epoch": 1.4890047336568997, + "grad_norm": 0.42447686195373535, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0133, + "step": 24850 + }, + { + "epoch": 1.4896039307328182, + "grad_norm": 0.43447887897491455, + "learning_rate": 3.36005636574796e-06, + "loss": 0.017, + "step": 24860 + }, + { + "epoch": 1.4902031278087362, + "grad_norm": 0.3336028754711151, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0159, + "step": 24870 + }, + { + "epoch": 1.4908023248846547, + "grad_norm": 0.3250858187675476, + "learning_rate": 3.349767211300933e-06, + "loss": 0.0169, + "step": 24880 + }, + { + "epoch": 1.4914015219605727, + "grad_norm": 0.2616746425628662, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0138, + "step": 24890 + }, + { + "epoch": 1.4920007190364912, + "grad_norm": 0.2752698063850403, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0144, + "step": 24900 + }, + { + "epoch": 1.4925999161124093, + "grad_norm": 0.28214627504348755, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0157, + "step": 24910 + }, + { + "epoch": 1.4931991131883278, + "grad_norm": 0.3839667737483978, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0153, + "step": 24920 + }, + { + "epoch": 1.4937983102642458, + "grad_norm": 0.29319512844085693, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0165, + "step": 24930 + }, + { + "epoch": 1.4943975073401643, + "grad_norm": 0.4219116270542145, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0173, + "step": 24940 + }, + { + "epoch": 1.4949967044160823, + "grad_norm": 0.4940520226955414, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0163, + "step": 24950 + }, + { + "epoch": 1.4955959014920008, + "grad_norm": 0.40064749121665955, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0146, + "step": 24960 + }, + { + "epoch": 1.4961950985679189, + "grad_norm": 0.33400869369506836, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0165, + "step": 24970 + }, + { + "epoch": 1.4967942956438374, + "grad_norm": 0.2474612295627594, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0154, + "step": 24980 + }, + { + "epoch": 1.4973934927197554, + "grad_norm": 0.32819071412086487, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0189, + "step": 24990 + }, + { + "epoch": 1.497992689795674, + "grad_norm": 0.32721251249313354, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0144, + "step": 25000 + }, + { + "epoch": 1.498591886871592, + "grad_norm": 0.4054602086544037, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.017, + "step": 25010 + }, + { + "epoch": 1.4991910839475104, + "grad_norm": 0.4691202938556671, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0188, + "step": 25020 + }, + { + "epoch": 1.4997902810234285, + "grad_norm": 0.9318768382072449, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0192, + "step": 25030 + }, + { + "epoch": 1.500389478099347, + "grad_norm": 0.25441330671310425, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0143, + "step": 25040 + }, + { + "epoch": 1.500988675175265, + "grad_norm": 0.3425164520740509, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0152, + "step": 25050 + }, + { + "epoch": 1.5015878722511835, + "grad_norm": 0.3809274733066559, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0154, + "step": 25060 + }, + { + "epoch": 1.5021870693271016, + "grad_norm": 0.2595506012439728, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0144, + "step": 25070 + }, + { + "epoch": 1.50278626640302, + "grad_norm": 0.29121503233909607, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0131, + "step": 25080 + }, + { + "epoch": 1.503385463478938, + "grad_norm": 0.2435981184244156, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0169, + "step": 25090 + }, + { + "epoch": 1.5039846605548566, + "grad_norm": 0.2967667579650879, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0146, + "step": 25100 + }, + { + "epoch": 1.5045838576307746, + "grad_norm": 0.2658415138721466, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0164, + "step": 25110 + }, + { + "epoch": 1.5051830547066931, + "grad_norm": 0.25294387340545654, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0137, + "step": 25120 + }, + { + "epoch": 1.5057822517826112, + "grad_norm": 0.4117964208126068, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0175, + "step": 25130 + }, + { + "epoch": 1.5063814488585296, + "grad_norm": 0.22604988515377045, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0187, + "step": 25140 + }, + { + "epoch": 1.5069806459344477, + "grad_norm": 0.2773517668247223, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0176, + "step": 25150 + }, + { + "epoch": 1.5075798430103662, + "grad_norm": 0.3213720917701721, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0181, + "step": 25160 + }, + { + "epoch": 1.5081790400862842, + "grad_norm": 0.3932463526725769, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0169, + "step": 25170 + }, + { + "epoch": 1.5087782371622027, + "grad_norm": 0.27642500400543213, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0177, + "step": 25180 + }, + { + "epoch": 1.5093774342381208, + "grad_norm": 0.4212909936904907, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0165, + "step": 25190 + }, + { + "epoch": 1.5099766313140393, + "grad_norm": 0.31928038597106934, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0159, + "step": 25200 + }, + { + "epoch": 1.5105758283899573, + "grad_norm": 0.31685909628868103, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0146, + "step": 25210 + }, + { + "epoch": 1.5111750254658758, + "grad_norm": 0.22591470181941986, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0142, + "step": 25220 + }, + { + "epoch": 1.5117742225417938, + "grad_norm": 0.22344504296779633, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0127, + "step": 25230 + }, + { + "epoch": 1.5123734196177123, + "grad_norm": 0.4538969099521637, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.0174, + "step": 25240 + }, + { + "epoch": 1.5129726166936306, + "grad_norm": 0.35422542691230774, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0169, + "step": 25250 + }, + { + "epoch": 1.5135718137695489, + "grad_norm": 0.41911551356315613, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0157, + "step": 25260 + }, + { + "epoch": 1.5141710108454671, + "grad_norm": 0.4679270088672638, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0147, + "step": 25270 + }, + { + "epoch": 1.5147702079213854, + "grad_norm": 0.29286396503448486, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0168, + "step": 25280 + }, + { + "epoch": 1.5153694049973037, + "grad_norm": 0.2840272784233093, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0182, + "step": 25290 + }, + { + "epoch": 1.515968602073222, + "grad_norm": 0.3369516432285309, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0159, + "step": 25300 + }, + { + "epoch": 1.5165677991491402, + "grad_norm": 0.36810392141342163, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.0207, + "step": 25310 + }, + { + "epoch": 1.5171669962250585, + "grad_norm": 0.30844470858573914, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.0151, + "step": 25320 + }, + { + "epoch": 1.5177661933009767, + "grad_norm": 0.22359415888786316, + "learning_rate": 3.127844986891409e-06, + "loss": 0.018, + "step": 25330 + }, + { + "epoch": 1.518365390376895, + "grad_norm": 0.42099806666374207, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0158, + "step": 25340 + }, + { + "epoch": 1.5189645874528133, + "grad_norm": 0.2903825342655182, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0157, + "step": 25350 + }, + { + "epoch": 1.5195637845287315, + "grad_norm": 0.33182457089424133, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0179, + "step": 25360 + }, + { + "epoch": 1.5201629816046498, + "grad_norm": 0.4607376158237457, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0189, + "step": 25370 + }, + { + "epoch": 1.520762178680568, + "grad_norm": 0.21630525588989258, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0156, + "step": 25380 + }, + { + "epoch": 1.5213613757564863, + "grad_norm": 0.38443559408187866, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0174, + "step": 25390 + }, + { + "epoch": 1.5219605728324046, + "grad_norm": 0.19618573784828186, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0126, + "step": 25400 + }, + { + "epoch": 1.5225597699083229, + "grad_norm": 0.4141467809677124, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0147, + "step": 25410 + }, + { + "epoch": 1.5231589669842411, + "grad_norm": 0.39915844798088074, + "learning_rate": 3.085688933413021e-06, + "loss": 0.0156, + "step": 25420 + }, + { + "epoch": 1.5237581640601594, + "grad_norm": 0.25136515498161316, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0147, + "step": 25430 + }, + { + "epoch": 1.5243573611360777, + "grad_norm": 0.30357712507247925, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0153, + "step": 25440 + }, + { + "epoch": 1.524956558211996, + "grad_norm": 0.37422874569892883, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0146, + "step": 25450 + }, + { + "epoch": 1.5255557552879142, + "grad_norm": 0.19593080878257751, + "learning_rate": 3.067194157156521e-06, + "loss": 0.0185, + "step": 25460 + }, + { + "epoch": 1.5261549523638325, + "grad_norm": 0.4984768033027649, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0159, + "step": 25470 + }, + { + "epoch": 1.5267541494397507, + "grad_norm": 0.35011765360832214, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0171, + "step": 25480 + }, + { + "epoch": 1.527353346515669, + "grad_norm": 0.43658894300460815, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.014, + "step": 25490 + }, + { + "epoch": 1.5279525435915873, + "grad_norm": 0.3372974693775177, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0205, + "step": 25500 + }, + { + "epoch": 1.5285517406675055, + "grad_norm": 0.2942260205745697, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0166, + "step": 25510 + }, + { + "epoch": 1.5291509377434238, + "grad_norm": 0.43129920959472656, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0167, + "step": 25520 + }, + { + "epoch": 1.529750134819342, + "grad_norm": 0.3023529648780823, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0167, + "step": 25530 + }, + { + "epoch": 1.5303493318952603, + "grad_norm": 0.298043429851532, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0185, + "step": 25540 + }, + { + "epoch": 1.5309485289711786, + "grad_norm": 0.2765754461288452, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0134, + "step": 25550 + }, + { + "epoch": 1.5315477260470969, + "grad_norm": 0.43460533022880554, + "learning_rate": 3.021609639602321e-06, + "loss": 0.014, + "step": 25560 + }, + { + "epoch": 1.5321469231230151, + "grad_norm": 0.2843260169029236, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0155, + "step": 25570 + }, + { + "epoch": 1.5327461201989334, + "grad_norm": 0.3337956964969635, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0164, + "step": 25580 + }, + { + "epoch": 1.5333453172748517, + "grad_norm": 0.4841095805168152, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0147, + "step": 25590 + }, + { + "epoch": 1.53394451435077, + "grad_norm": 0.31032758951187134, + "learning_rate": 3.003637700546652e-06, + "loss": 0.015, + "step": 25600 + }, + { + "epoch": 1.5345437114266882, + "grad_norm": 0.4080669581890106, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0151, + "step": 25610 + }, + { + "epoch": 1.5351429085026065, + "grad_norm": 0.23705625534057617, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0136, + "step": 25620 + }, + { + "epoch": 1.5357421055785248, + "grad_norm": 0.5293036103248596, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0195, + "step": 25630 + }, + { + "epoch": 1.536341302654443, + "grad_norm": 0.19166356325149536, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0118, + "step": 25640 + }, + { + "epoch": 1.5369404997303613, + "grad_norm": 0.35923510789871216, + "learning_rate": 2.981383959667165e-06, + "loss": 0.0153, + "step": 25650 + }, + { + "epoch": 1.5375396968062796, + "grad_norm": 0.525636613368988, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0169, + "step": 25660 + }, + { + "epoch": 1.5381388938821978, + "grad_norm": 0.3833159804344177, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0155, + "step": 25670 + }, + { + "epoch": 1.538738090958116, + "grad_norm": 0.30203381180763245, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0163, + "step": 25680 + }, + { + "epoch": 1.5393372880340344, + "grad_norm": 0.5735456347465515, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0158, + "step": 25690 + }, + { + "epoch": 1.5399364851099526, + "grad_norm": 0.4676662087440491, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0195, + "step": 25700 + }, + { + "epoch": 1.540535682185871, + "grad_norm": 0.29208818078041077, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0165, + "step": 25710 + }, + { + "epoch": 1.5411348792617892, + "grad_norm": 0.3703807294368744, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.015, + "step": 25720 + }, + { + "epoch": 1.5417340763377074, + "grad_norm": 0.5645684003829956, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0192, + "step": 25730 + }, + { + "epoch": 1.5423332734136257, + "grad_norm": 0.5154808759689331, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0154, + "step": 25740 + }, + { + "epoch": 1.542932470489544, + "grad_norm": 0.49836722016334534, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0161, + "step": 25750 + }, + { + "epoch": 1.5435316675654622, + "grad_norm": 0.4711974561214447, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0143, + "step": 25760 + }, + { + "epoch": 1.5441308646413805, + "grad_norm": 0.3468717932701111, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0151, + "step": 25770 + }, + { + "epoch": 1.5447300617172988, + "grad_norm": 0.3216229975223541, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0146, + "step": 25780 + }, + { + "epoch": 1.5453292587932173, + "grad_norm": 0.3436613976955414, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.0172, + "step": 25790 + }, + { + "epoch": 1.5459284558691353, + "grad_norm": 0.3601810336112976, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0166, + "step": 25800 + }, + { + "epoch": 1.5465276529450538, + "grad_norm": 0.2320292890071869, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0156, + "step": 25810 + }, + { + "epoch": 1.5471268500209718, + "grad_norm": 0.4563167989253998, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0168, + "step": 25820 + }, + { + "epoch": 1.5477260470968903, + "grad_norm": 0.33735397458076477, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0165, + "step": 25830 + }, + { + "epoch": 1.5483252441728084, + "grad_norm": 0.41785505414009094, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0179, + "step": 25840 + }, + { + "epoch": 1.5489244412487269, + "grad_norm": 0.41172194480895996, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.016, + "step": 25850 + }, + { + "epoch": 1.549523638324645, + "grad_norm": 0.4549838900566101, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0134, + "step": 25860 + }, + { + "epoch": 1.5501228354005634, + "grad_norm": 0.6315169930458069, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0176, + "step": 25870 + }, + { + "epoch": 1.5507220324764814, + "grad_norm": 0.43143466114997864, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0166, + "step": 25880 + }, + { + "epoch": 1.5513212295524, + "grad_norm": 0.4559693932533264, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0162, + "step": 25890 + }, + { + "epoch": 1.551920426628318, + "grad_norm": 0.3333865702152252, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0168, + "step": 25900 + }, + { + "epoch": 1.5525196237042365, + "grad_norm": 0.3939986526966095, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0146, + "step": 25910 + }, + { + "epoch": 1.5531188207801545, + "grad_norm": 0.35824787616729736, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0179, + "step": 25920 + }, + { + "epoch": 1.553718017856073, + "grad_norm": 0.40517401695251465, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0158, + "step": 25930 + }, + { + "epoch": 1.554317214931991, + "grad_norm": 0.41149890422821045, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0142, + "step": 25940 + }, + { + "epoch": 1.5549164120079095, + "grad_norm": 0.22149957716464996, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0153, + "step": 25950 + }, + { + "epoch": 1.5555156090838276, + "grad_norm": 0.2622004747390747, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0142, + "step": 25960 + }, + { + "epoch": 1.556114806159746, + "grad_norm": 0.3235580623149872, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.014, + "step": 25970 + }, + { + "epoch": 1.5567140032356641, + "grad_norm": 0.4349730312824249, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0148, + "step": 25980 + }, + { + "epoch": 1.5573132003115826, + "grad_norm": 0.30583831667900085, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0148, + "step": 25990 + }, + { + "epoch": 1.5579123973875006, + "grad_norm": 0.3436671495437622, + "learning_rate": 2.832230653119002e-06, + "loss": 0.015, + "step": 26000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.624037727047516e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5fecc60b61aa66699566b01045633ce2fd4a6a74 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-26000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad96fcc5212b0fb64af2ed9b5a1ad33dee0cea6a86c08271b39c38f4388a38a +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f690e33538e4ea829d7f2f440de882604e055464 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69827123f17dbb980025c0c1136c2cc382bb67cfe9bdef71fb5e9b6580009e69 +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0d617d601348aa1a39f980428cfe926a9f1b6764 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4bca0db7f0a66516086bec5a848fa62bc1e78ae403f380315e61ac69ed87fac +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c0d42b05b2c9ab473453b85195b2825e22361422 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9302fce8ba51706fd74ec0db037887178b8f7691bfbf900776ee62dd85a91e1 +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9801b876d4902a6f04c8f4fc65c072e6082867 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -4.131592681121827, + -18.96289906921387, + -16.909606227111816, + -1.205507601451874, + -2.2364452423095704, + -1.8819086204528812, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 16.65274486618042, + 37.19429024200439, + 23.655689654541014, + 1.3209557065963748, + 2.6528479496955875, + 1.1486967510223387, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 2.868856906890869, + 6.296340465545654, + 1.3196077346801758, + 0.007151931058615446, + -0.012491658329963684, + -0.12626242637634277, + 0.12140887975692749, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 4.3321146965026855, + 12.4215087890625, + 7.703039169311523, + 0.391439288854599, + 0.8076039552688599, + 0.505150318145752, + 0.9926025867462158, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.763728466033935, + -21.229162658691408, + -2.350775989151001, + -4.0587354017257695, + -3.285622364997864, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.5495108631134, + 30.41332916412354, + 14.36571702880859, + 1.8286980584144592, + 2.2455153399467473, + 1.9114159921646117, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.469674587249756, + 1.137302041053772, + -3.50521183013916, + -0.009232619777321815, + -0.7088616490364075, + -0.43785586953163147, + 0.14176446199417114, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.948984146118164, + 16.641460418701172, + 8.162801742553711, + 0.6890953779220581, + 1.1180040836334229, + 0.9564125537872314, + 0.9899004101753235, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..abb6106415f4f5c7ba87584deb494eccf5bc135c --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/trainer_state.json @@ -0,0 +1,19634 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6777518125711546, + "eval_steps": 500, + "global_step": 28000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 5.55898904800415, + "learning_rate": 1.8e-07, + "loss": 0.7669, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.9875104427337646, + "learning_rate": 3.8e-07, + "loss": 0.7281, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 6.316451072692871, + "learning_rate": 5.800000000000001e-07, + "loss": 0.7134, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 4.037688255310059, + "learning_rate": 7.8e-07, + "loss": 0.6077, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 5.4920220375061035, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6779, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 3.809985876083374, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5578, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 5.501481533050537, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5453, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 2.584683418273926, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4145, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 2.854585886001587, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3617, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 3.2181553840637207, + "learning_rate": 1.98e-06, + "loss": 0.3402, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 1.6713179349899292, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2286, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 2.60302996635437, + "learning_rate": 2.38e-06, + "loss": 0.2477, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 1.7488818168640137, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1342, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 1.826812982559204, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.1243, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 1.1744091510772705, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1012, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 2.3573529720306396, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1108, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 2.1422371864318848, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1081, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.6756604313850403, + "learning_rate": 3.58e-06, + "loss": 0.0947, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 1.8197052478790283, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.103, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 2.135390281677246, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0791, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 1.185013771057129, + "learning_rate": 4.18e-06, + "loss": 0.0751, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 1.478454828262329, + "learning_rate": 4.38e-06, + "loss": 0.0685, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 1.1979939937591553, + "learning_rate": 4.58e-06, + "loss": 0.0642, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 1.3315266370773315, + "learning_rate": 4.78e-06, + "loss": 0.0706, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 1.219875454902649, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 1.9281997680664062, + "learning_rate": 5.18e-06, + "loss": 0.0781, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.5599610209465027, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0742, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.9128719568252563, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0638, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.5633432269096375, + "learning_rate": 5.78e-06, + "loss": 0.0633, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.7961149215698242, + "learning_rate": 5.98e-06, + "loss": 0.062, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 1.9408375024795532, + "learning_rate": 6.18e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 1.1925369501113892, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0654, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 1.0636825561523438, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0513, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.5671424865722656, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0561, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.8431388139724731, + "learning_rate": 6.98e-06, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 1.3813819885253906, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0619, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.7528055906295776, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0502, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 1.38446044921875, + "learning_rate": 7.58e-06, + "loss": 0.0623, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.9472984671592712, + "learning_rate": 7.78e-06, + "loss": 0.0471, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.640555739402771, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0539, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 1.4841065406799316, + "learning_rate": 8.18e-06, + "loss": 0.0684, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 1.0691452026367188, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.8026740550994873, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0579, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 1.3472259044647217, + "learning_rate": 8.78e-06, + "loss": 0.0725, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.8364902138710022, + "learning_rate": 8.98e-06, + "loss": 0.0613, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 1.0086181163787842, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0558, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 1.0559569597244263, + "learning_rate": 9.38e-06, + "loss": 0.0561, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.9138600826263428, + "learning_rate": 9.58e-06, + "loss": 0.0507, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.6099390387535095, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0543, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.890690803527832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.071, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.8349231481552124, + "learning_rate": 1.018e-05, + "loss": 0.0515, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 1.5466762781143188, + "learning_rate": 1.038e-05, + "loss": 0.0865, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 1.0859519243240356, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0511, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.7235454320907593, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.6314525008201599, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0494, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 1.5067164897918701, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0453, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.9329689145088196, + "learning_rate": 1.138e-05, + "loss": 0.0565, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 1.3631505966186523, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0513, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 1.2341063022613525, + "learning_rate": 1.178e-05, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.7126315832138062, + "learning_rate": 1.198e-05, + "loss": 0.0465, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.9995419383049011, + "learning_rate": 1.218e-05, + "loss": 0.0423, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.7614652514457703, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0466, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.7718682289123535, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0508, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.7280911803245544, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0481, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.6350377798080444, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0493, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.6868598461151123, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.057, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 1.132020354270935, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0464, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 1.097875952720642, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0465, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.8246905207633972, + "learning_rate": 1.378e-05, + "loss": 0.0488, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.5858931541442871, + "learning_rate": 1.398e-05, + "loss": 0.0533, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.418e-05, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.87618488073349, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0417, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.8312808871269226, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0627, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.5213949680328369, + "learning_rate": 1.478e-05, + "loss": 0.0526, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.7599508762359619, + "learning_rate": 1.498e-05, + "loss": 0.0487, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.9282987713813782, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0544, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 1.5959566831588745, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0594, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.6384497284889221, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.049, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.5377854108810425, + "learning_rate": 1.578e-05, + "loss": 0.0529, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.6186609864234924, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0485, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.9750168323516846, + "learning_rate": 1.618e-05, + "loss": 0.0458, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.6810588836669922, + "learning_rate": 1.638e-05, + "loss": 0.0521, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.8613447546958923, + "learning_rate": 1.658e-05, + "loss": 0.0464, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.8379164338111877, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0589, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.9312345385551453, + "learning_rate": 1.698e-05, + "loss": 0.0534, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.6983106732368469, + "learning_rate": 1.718e-05, + "loss": 0.0591, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.6549938321113586, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0571, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3887499272823334, + "learning_rate": 1.758e-05, + "loss": 0.0362, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 1.1392686367034912, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0602, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.834979772567749, + "learning_rate": 1.798e-05, + "loss": 0.0483, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.9094700813293457, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0536, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.9519254565238953, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0514, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.6514044404029846, + "learning_rate": 1.858e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.6005147099494934, + "learning_rate": 1.878e-05, + "loss": 0.0527, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 1.0990339517593384, + "learning_rate": 1.898e-05, + "loss": 0.0453, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.7029110193252563, + "learning_rate": 1.918e-05, + "loss": 0.0527, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.6106461882591248, + "learning_rate": 1.938e-05, + "loss": 0.043, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.48976996541023254, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0482, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 1.045139193534851, + "learning_rate": 1.978e-05, + "loss": 0.0449, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.7444337010383606, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0499, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.8378720879554749, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0606, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.5345956683158875, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.041, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.6428268551826477, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0648, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.9010246992111206, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0441, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.6655222177505493, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0532, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.5328973531723022, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0488, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 1.2394806146621704, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0525, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.9671902656555176, + "learning_rate": 1.999967041472886e-05, + "loss": 0.051, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.8754792213439941, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.054, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.524354875087738, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0682, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 1.0633796453475952, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0435, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.7348024249076843, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.923546552658081, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0501, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 1.0579051971435547, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.8214036822319031, + "learning_rate": 1.999882759038658e-05, + "loss": 0.057, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.7640904188156128, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0468, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5744732022285461, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0416, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.40397152304649353, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0389, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.6207796931266785, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0484, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 1.5230320692062378, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0586, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.8499330282211304, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0671, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.7697583436965942, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.061, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.6107252836227417, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.40468829870224, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0558, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.7711566686630249, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0487, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 1.0216137170791626, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0411, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 1.1135109663009644, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0428, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.545289158821106, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0426, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.9514102339744568, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0529, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.9448748826980591, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0468, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 1.1176340579986572, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.06, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.6428054571151733, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0398, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.8000763058662415, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0688, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.7624617218971252, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0524, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.7986068725585938, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0511, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 1.179044246673584, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0518, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.7511209845542908, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.041, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.8336644768714905, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.7198546528816223, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0472, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 1.404756784439087, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0479, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.861412525177002, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0448, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 1.2575286626815796, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0504, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.7020149230957031, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0416, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.9072129726409912, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0483, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.5503928661346436, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0498, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.5776561498641968, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.7854406237602234, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0431, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.7011817097663879, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0615, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.7760916352272034, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0525, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.9866206049919128, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0492, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.7466640472412109, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0564, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.8808642029762268, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0461, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.8980852365493774, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0613, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.6824257969856262, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0763, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.681532084941864, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0492, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.5667393207550049, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0471, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.5026432275772095, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0424, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.37448638677597046, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.037, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.6236661076545715, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.9748323559761047, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0326, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.7733910083770752, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0527, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.6466084718704224, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.6644402146339417, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0434, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 1.5936143398284912, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0495, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.5655786991119385, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0475, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.9557194709777832, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0518, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.8929481506347656, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0435, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.7515624761581421, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0404, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.7718303203582764, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0476, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.5583183765411377, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0495, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.7166038155555725, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0601, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.9311782717704773, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0507, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6159361600875854, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0319, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.816769003868103, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0505, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.9040331244468689, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.0498, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 1.696012020111084, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0689, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.5169436931610107, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0414, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 1.9156256914138794, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0558, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.6522107720375061, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0427, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.8480607867240906, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0425, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.6939795017242432, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0521, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.5763843059539795, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0486, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.6420201063156128, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0428, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.5305889248847961, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0371, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 1.3216971158981323, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0441, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.6441370844841003, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0444, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 1.4227683544158936, + "learning_rate": 1.996014938229576e-05, + "loss": 0.053, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.667000412940979, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0405, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.6865925192832947, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0532, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.8819414377212524, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0402, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.8738685250282288, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0494, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.8790421485900879, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0753, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.5451251268386841, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0385, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.46721863746643066, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0395, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.41896265745162964, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0461, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.7582527995109558, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0461, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.7154091596603394, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0464, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.788686215877533, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.46885132789611816, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0472, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.5174703598022461, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0501, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.8058022260665894, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.044, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.49327152967453003, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0404, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 1.532515048980713, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0548, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 1.1101130247116089, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0542, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.7396823763847351, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.042, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5801792740821838, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0589, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 1.4451886415481567, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0402, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.61793053150177, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0583, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.8073042631149292, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0492, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.9468027949333191, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0466, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.7384629249572754, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0589, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.4612124562263489, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.043, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.6821345090866089, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0373, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.6727206110954285, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0706, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.6935863494873047, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0376, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.9824007153511047, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0418, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.9782054424285889, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0453, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.7749345898628235, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0449, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 1.1558616161346436, + "learning_rate": 1.992544454099507e-05, + "loss": 0.051, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.33876606822013855, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0463, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.5539175271987915, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0389, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.554639995098114, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0375, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.46284249424934387, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0365, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.7209586501121521, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0465, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 1.0352572202682495, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0609, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.3893347680568695, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0449, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.3959295451641083, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.042, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.47758615016937256, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0608, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.7173318266868591, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0511, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.5889247059822083, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.5986958146095276, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.9506963491439819, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0513, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.8730902671813965, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0429, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.5152983069419861, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0347, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.786233127117157, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0464, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.7376151084899902, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0479, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.595055878162384, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0392, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.8207923769950867, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0441, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.7003177404403687, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.036, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.6637803316116333, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.5207458138465881, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0476, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 1.241939663887024, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0466, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.7212964296340942, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0459, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.6244897246360779, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.571205198764801, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0611, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.8839776515960693, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0464, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.580142080783844, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0434, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.6745111346244812, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0443, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.9726730585098267, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.48007458448410034, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0442, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.7205815315246582, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0461, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.5800597667694092, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0553, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.6497617959976196, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0398, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.7487000226974487, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.053, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.6686383485794067, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0494, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.6101617217063904, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0397, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.49039891362190247, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 1.076252818107605, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0472, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.7085466980934143, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0481, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.6343501210212708, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.7452435493469238, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0485, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.6645557880401611, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0455, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.5987662076950073, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0384, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 1.078682541847229, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0416, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.8880276083946228, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0427, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.8119439482688904, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0516, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.5018808245658875, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.035, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.623843252658844, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0468, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.48201584815979004, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0387, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.5672967433929443, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0374, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.7304291129112244, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0458, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 1.1493513584136963, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0495, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.8220258951187134, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0565, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 1.0740118026733398, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0484, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.6214267015457153, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0346, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6255515813827515, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 1.0625102519989014, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0511, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.8623147010803223, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.043, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.92961186170578, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0428, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.6050530076026917, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0405, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.944632351398468, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0434, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.4904105067253113, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0423, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.7352654337882996, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0425, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 1.0492011308670044, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0616, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.7823440432548523, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0447, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.8018720149993896, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0371, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.49853745102882385, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.036, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.8805229663848877, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0524, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.5573164820671082, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0387, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.7481330633163452, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0466, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.40816730260849, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0651, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.6791403889656067, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0393, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.7291558384895325, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0521, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.6312416791915894, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0489, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.7327824831008911, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0343, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 1.3112396001815796, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 1.2425460815429688, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0419, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.6839079856872559, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0491, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.7781338691711426, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0434, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.5329035520553589, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0468, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.7196246981620789, + "learning_rate": 1.978769450291435e-05, + "loss": 0.044, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.7625473737716675, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0441, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.5458085536956787, + "learning_rate": 1.978346349055984e-05, + "loss": 0.039, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.7765107154846191, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0467, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.7010345458984375, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.04, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.626748263835907, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0373, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.5149411559104919, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0461, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.9740221500396729, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.037, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.504397988319397, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.054, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.5483772158622742, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0365, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.29313552379608154, + "learning_rate": 1.976612732743278e-05, + "loss": 0.038, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.8453809022903442, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0413, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.5152369141578674, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0383, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.9969985485076904, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0465, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.9506912231445312, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0377, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.9154256582260132, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0428, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 1.2283018827438354, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0403, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.6880149841308594, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0395, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.4900283217430115, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0368, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.7604786157608032, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0447, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.559420108795166, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0456, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5867525339126587, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.4810929596424103, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0406, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.8294567465782166, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0405, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.8964418172836304, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0551, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5311513543128967, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.048, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.806564450263977, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0422, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.6752825975418091, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0436, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.5873673558235168, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.046, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.44951826333999634, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.6930672526359558, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0482, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5176821351051331, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0469, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.49050986766815186, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0505, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.7312544584274292, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0397, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.7582018375396729, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0472, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.5867499113082886, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0402, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.5435264706611633, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0357, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.7370457053184509, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.774713933467865, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0419, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 1.3614526987075806, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0443, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.6087996959686279, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0362, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.6685174703598022, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0437, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.9508783221244812, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0403, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.5553990006446838, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0454, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.5054144263267517, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0651, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.42293739318847656, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0431, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.7212286591529846, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0415, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.473127543926239, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.6872493028640747, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.031, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.5251455903053284, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0391, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.5380337834358215, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0409, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.7052116394042969, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0416, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.8229309916496277, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0372, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.9506240487098694, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0419, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.6417449116706848, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.6112877130508423, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0498, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 1.0621747970581055, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0478, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.7538444995880127, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0402, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.5625021457672119, + "learning_rate": 1.964833301001045e-05, + "loss": 0.048, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.47914358973503113, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0371, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.6854084134101868, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0478, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.9252145886421204, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0368, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.8439743518829346, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0417, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 1.0050065517425537, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0444, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.7451267242431641, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0444, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.8371824622154236, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0413, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 1.0461528301239014, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0343, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.39973369240760803, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0411, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.4291725754737854, + "learning_rate": 1.962083815106258e-05, + "loss": 0.035, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.7072318196296692, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.5897591710090637, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0422, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.724743664264679, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0412, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.6499989628791809, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0456, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.7375554442405701, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0481, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.5231707096099854, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0444, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.6235650777816772, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0352, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.43499720096588135, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0389, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.797736406326294, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0444, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 1.0550916194915771, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0504, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.6214169263839722, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0406, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.698083221912384, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0593, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.6379665732383728, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0493, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.5507146120071411, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0433, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.5956857204437256, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.44772031903266907, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0479, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.9360495209693909, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0434, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.5642439126968384, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0396, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.4046037495136261, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0408, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.5948778986930847, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0349, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.8199960589408875, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.035, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.4827987253665924, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0422, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.8324541449546814, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0396, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.4008340537548065, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0399, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.6216022372245789, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0456, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.37505266070365906, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0385, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.49176743626594543, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0394, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.5399725437164307, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0438, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.8310949802398682, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 1.1955338716506958, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0459, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 1.0068060159683228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0491, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.5460902452468872, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.7850955128669739, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.038, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.36727651953697205, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.042, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.5334084630012512, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0472, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.7271261215209961, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0382, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.5323888063430786, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0436, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.45585381984710693, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0374, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.7871994376182556, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0523, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.5605924129486084, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0394, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.6938880085945129, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0394, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.5804795026779175, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0437, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 1.0168874263763428, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0419, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.6860261559486389, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0381, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.7029629349708557, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.5081820487976074, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0359, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4721413254737854, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0445, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.36132606863975525, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0443, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.6331628561019897, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.5754039287567139, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0364, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 1.5680726766586304, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0568, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.49352893233299255, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0352, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.6292720437049866, + "learning_rate": 1.945830755977688e-05, + "loss": 0.056, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.7185224294662476, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.5580431222915649, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0395, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.7590157985687256, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0367, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.6500505208969116, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0373, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.408975213766098, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0458, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.5616204142570496, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.6361889243125916, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0371, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.8486977219581604, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0428, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.7492835521697998, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0444, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.7901867032051086, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0413, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.6845218539237976, + "learning_rate": 1.942106227801521e-05, + "loss": 0.041, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.9644033908843994, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0482, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.45466694235801697, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.37155815958976746, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0563, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.4936427175998688, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0466, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.6540364027023315, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0426, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.38369905948638916, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.5450782179832458, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0499, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.24151510000228882, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0431, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.8069043159484863, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0447, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.5423257946968079, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0496, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.4058588445186615, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0402, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.6126188635826111, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0458, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.7490487694740295, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0493, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.7295238971710205, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0462, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.7178632616996765, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0443, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.7040836215019226, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0414, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.6338651776313782, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0354, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 1.3360642194747925, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0503, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.46927154064178467, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.7340303659439087, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0381, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5492366552352905, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0328, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.7509336471557617, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0368, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.4471103847026825, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0405, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.6582043170928955, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0422, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.6933317184448242, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0347, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.450021892786026, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.5376274585723877, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0619, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.722744882106781, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0446, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.6075776219367981, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.047, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.6950559020042419, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0366, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.5763269066810608, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0416, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.5462995767593384, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.042, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.6304270029067993, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0388, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.6828057765960693, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0324, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.37152284383773804, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0451, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.4172256588935852, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0357, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.5640333294868469, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0522, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.8016167879104614, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0381, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.591262698173523, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0382, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.5212893486022949, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0478, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.7837402820587158, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0443, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.9257993698120117, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0468, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.5952717065811157, + "learning_rate": 1.926404507646751e-05, + "loss": 0.033, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.9675727486610413, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0451, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5518060326576233, + "learning_rate": 1.925630281527157e-05, + "loss": 0.039, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.9742224216461182, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0398, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.6197847723960876, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0466, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.47963154315948486, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0449, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.41337478160858154, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0441, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.7238340973854065, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0438, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.9248948097229004, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.059, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.6670559048652649, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0388, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.956350564956665, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0402, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.6378766894340515, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0377, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.9037134647369385, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.046, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.7720431685447693, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0519, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.7988153100013733, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0437, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.6672379970550537, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.8264118432998657, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0463, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.6753244400024414, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.048, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.5530163645744324, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0552, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 1.4215611219406128, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0537, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.8495141267776489, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0431, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.5609806180000305, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0355, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.30011680722236633, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0503, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.5155858993530273, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0402, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.48371294140815735, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0476, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.49065709114074707, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.4877799451351166, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0337, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.5917441248893738, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0379, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.42583322525024414, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.045, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.6343463659286499, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0449, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.8575727343559265, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0453, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.7644649147987366, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0396, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.6534778475761414, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0354, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.3632607161998749, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.035, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.9180629849433899, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.48914220929145813, + "learning_rate": 1.912298771234382e-05, + "loss": 0.043, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.8579902052879333, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0467, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 1.523177146911621, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 1.2650493383407593, + "learning_rate": 1.911035077753307e-05, + "loss": 0.046, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.8262631893157959, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0345, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.8710194826126099, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.8287770748138428, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.7243760824203491, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0445, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5953600406646729, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0409, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.5678296685218811, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0405, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.764759361743927, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0399, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.5969082713127136, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0345, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.5686851739883423, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0415, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.7042335867881775, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0343, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.46049684286117554, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0367, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.521037757396698, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0493, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.6116137504577637, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0341, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.6932541728019714, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.038, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.6795322299003601, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0555, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 1.5589205026626587, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0498, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.58689945936203, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0432, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.7746279239654541, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0455, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4707143008708954, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0365, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.6717873811721802, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0441, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 1.1001774072647095, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0387, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.6617273092269897, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.045, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 1.0732862949371338, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0461, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.43623387813568115, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0387, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.5842541456222534, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0401, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.8832051753997803, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0434, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.8454849123954773, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0364, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4587421119213104, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0342, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.5914700627326965, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0381, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.5075448751449585, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0614, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.6165316700935364, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0355, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.8761339783668518, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0382, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.8730667233467102, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0486, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.4631735384464264, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0479, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.7657212615013123, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0359, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.49685898423194885, + "learning_rate": 1.894749443411004e-05, + "loss": 0.037, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.8567603230476379, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0415, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.8778802156448364, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0427, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.7849876284599304, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.041, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.49304109811782837, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0406, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.6490961909294128, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0457, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 1.1704363822937012, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0489, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.6955687403678894, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0426, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.9385222792625427, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0397, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 1.0259507894515991, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0406, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 1.5581048727035522, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0377, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 1.1154224872589111, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0352, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.8913238048553467, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0372, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.32929253578186035, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0302, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.7686375379562378, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.7077587246894836, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0404, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.7370178699493408, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0379, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.8013477325439453, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0391, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.9743591547012329, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0466, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.6816489100456238, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0509, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.7814317345619202, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0449, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.6797910332679749, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.041, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.7159250378608704, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0408, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.7630175352096558, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0403, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.7929314374923706, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0468, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.5765302181243896, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0382, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.5043740272521973, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0447, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.7895818948745728, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0381, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.8037170767784119, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0434, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 1.0758732557296753, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0369, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.6673927307128906, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0475, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.6661775708198547, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0478, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.6422731280326843, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.6632615923881531, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0377, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.5715954899787903, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0306, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3375200629234314, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0385, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.42938506603240967, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0359, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.453436940908432, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0498, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.763883113861084, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.037, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.9350517392158508, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0524, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.6795313358306885, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0336, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4761887788772583, + "learning_rate": 1.875213208215953e-05, + "loss": 0.04, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.6547576189041138, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0359, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.7119831442832947, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0382, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.5195598602294922, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0577, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.44893282651901245, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.034, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.5159012079238892, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0374, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.6474353075027466, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0275, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.5070436000823975, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0382, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.28868627548217773, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0442, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.3915226459503174, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.6271824836730957, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0395, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 1.2117619514465332, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0409, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.4455721378326416, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0399, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0445, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.32646581530570984, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0435, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.4477322995662689, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0383, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.6562448740005493, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0317, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.25427868962287903, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0326, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.6234788298606873, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0328, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.4264411926269531, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0379, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.5537038445472717, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0383, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.5042442679405212, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0339, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.4152010679244995, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0324, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.6834092736244202, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0364, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.6276392340660095, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0336, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.687937319278717, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0415, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.48481765389442444, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0376, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 1.1335153579711914, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0421, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.6853719353675842, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.043, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.97500079870224, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0334, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.2953243553638458, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0334, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.6563237309455872, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0349, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.4983973205089569, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0441, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.42969775199890137, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0319, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.8316324353218079, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0359, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.4386466443538666, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0371, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.5664681792259216, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0359, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.5660601854324341, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.6432987451553345, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0447, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.6026568412780762, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0382, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.5358585119247437, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0366, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.3575671315193176, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0394, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.6645073890686035, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0391, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.6527594923973083, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0334, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.5664045810699463, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0426, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.3317505419254303, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0366, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.7218614220619202, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0399, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.6683867573738098, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0385, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.6589217185974121, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0445, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.39663317799568176, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0515, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.9468401074409485, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 1.0980640649795532, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0431, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 1.4567275047302246, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0467, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.3785778284072876, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0437, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.8112056255340576, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0406, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.8885411024093628, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0452, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.3356691002845764, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.033, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.7636258602142334, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.039, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.5050523281097412, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0331, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3761812150478363, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0346, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.560323178768158, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0417, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.5850566625595093, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0366, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.4377721846103668, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0315, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.5460193157196045, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0465, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.3818223476409912, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0313, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.566722571849823, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.037, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.970040500164032, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0354, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4968736171722412, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0376, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.5235893130302429, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0383, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.853208065032959, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0384, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.4627811312675476, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0615, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.4883791208267212, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0307, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.4702740013599396, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0539, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.5020611882209778, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0378, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.706611692905426, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0309, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.6137747764587402, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0364, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.45299193263053894, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0359, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.31410297751426697, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0425, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.48510870337486267, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.04, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.4697261154651642, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0401, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.8231471180915833, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0346, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.9511741995811462, + "learning_rate": 1.832162565208597e-05, + "loss": 0.038, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.4473752975463867, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0421, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.5309840440750122, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0375, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 1.1700010299682617, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0424, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.5007262229919434, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0389, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.8835527300834656, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.6059357523918152, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.37744027376174927, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0391, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.5641717910766602, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0383, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.4394749104976654, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0394, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.7094572186470032, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0384, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.6306723952293396, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0347, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.4480315148830414, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0415, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 1.014607310295105, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0426, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.7599517107009888, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0433, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 1.0942739248275757, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0378, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.47618037462234497, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0312, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6470023393630981, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0382, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.6031871438026428, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0336, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.7470970749855042, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0318, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.46166181564331055, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0361, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5585920214653015, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0443, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.5172198414802551, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0396, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.4908123314380646, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0294, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.5269665122032166, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0343, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.747257649898529, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0395, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.6794129610061646, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0471, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.4291394054889679, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0388, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.8051080107688904, + "learning_rate": 1.815952390818299e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.557299792766571, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0384, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.37832972407341003, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0333, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.30844688415527344, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.033, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.3014371395111084, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0344, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.778361439704895, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0351, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 1.14492666721344, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0462, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.35099321603775024, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0371, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.8470032215118408, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0339, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.641718327999115, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0363, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.6668172478675842, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0383, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.9396918416023254, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0401, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.5773718953132629, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0356, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.6474881172180176, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0487, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.5183063745498657, + "learning_rate": 1.807599344877606e-05, + "loss": 0.037, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.7699562311172485, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0487, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.6379490494728088, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0407, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4757876396179199, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0307, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.47382786870002747, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0367, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.6868136525154114, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0311, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.5475189089775085, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0293, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 1.013775110244751, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0383, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.46351560950279236, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0404, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.4883617162704468, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0408, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.6282979249954224, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0428, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 1.0833567380905151, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0394, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0405, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.7581565380096436, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0534, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.7900646328926086, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0432, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.6033529043197632, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0438, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.924926221370697, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0347, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.8485580682754517, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0523, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.3205278217792511, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0334, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.5392606854438782, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.03, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.6815987229347229, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0385, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.9605218768119812, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0359, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.5565723776817322, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0391, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.7528144717216492, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0431, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.5746167898178101, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0346, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.5058369636535645, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0346, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 1.1387027502059937, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0372, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.819324254989624, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0374, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.45600345730781555, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0344, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.7428935766220093, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0373, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.6960753202438354, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0387, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.6637990474700928, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0404, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.5612137317657471, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0375, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.6323001384735107, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0379, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.35169267654418945, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0371, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.38252803683280945, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0457, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.38694459199905396, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0345, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.37036198377609253, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0292, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.8060199618339539, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0398, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.44252580404281616, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0373, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.5565180778503418, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0345, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.4460795521736145, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0404, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.7309815883636475, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0364, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.6990997195243835, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0561, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.4198327660560608, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0401, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.04, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.48884230852127075, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0334, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.6440362930297852, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0451, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.9092825055122375, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0398, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.4839508533477783, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.8128801584243774, + "learning_rate": 1.776452218695584e-05, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.5291397571563721, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0394, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.6852243542671204, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0418, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.6294205188751221, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0374, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.5221384763717651, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0321, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.398296982049942, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0349, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.43008267879486084, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.6012991070747375, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0411, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.45076051354408264, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.037, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.6742259860038757, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0357, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.5989789962768555, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.037, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.4041040241718292, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0325, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.4937855899333954, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0354, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.5446217656135559, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0374, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.7479701638221741, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0415, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.7822495102882385, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0341, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.3672648072242737, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.035, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.5219965577125549, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0443, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.4092100262641907, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0331, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.5316944122314453, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0406, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 1.072263240814209, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0521, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.7448581457138062, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0362, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.44557711482048035, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0326, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.4298631250858307, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0365, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.45413365960121155, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0351, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.9562819004058838, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0394, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.9481335878372192, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0381, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.5020818114280701, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0402, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.6412234902381897, + "learning_rate": 1.758137056131309e-05, + "loss": 0.037, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.5511493682861328, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0535, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.5222594141960144, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0401, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.447127103805542, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0383, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.4780801832675934, + "learning_rate": 1.754802282200567e-05, + "loss": 0.041, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.2962804138660431, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0422, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.5125643014907837, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0337, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.4288216829299927, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0374, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4114690124988556, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0296, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.3511301577091217, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0315, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.8624657392501831, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0369, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.5518651008605957, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0364, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.5404661297798157, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0294, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.7494591474533081, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0315, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9748606085777283, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0429, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.8071768879890442, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0321, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.5210712552070618, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0355, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.6077958941459656, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0426, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.8688217997550964, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0366, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.7064969539642334, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0465, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0365, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.6350638270378113, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0419, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.42818939685821533, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0412, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.6915261745452881, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0327, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.9861057996749878, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.034, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.6910699009895325, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0463, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.6368144750595093, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0399, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 1.1909242868423462, + "learning_rate": 1.739216409306913e-05, + "loss": 0.042, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.6449970006942749, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0388, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.531061053276062, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0389, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.8275352716445923, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0503, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.8468548655509949, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0336, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.2949988842010498, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0342, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.30603477358818054, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 0.7177753448486328, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0381, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.4893733859062195, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0319, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.6618909239768982, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0317, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.5965152382850647, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0293, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.4357168674468994, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0478, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.9539002776145935, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0444, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.7171940207481384, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.037, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.5711817741394043, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.034, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.4134632647037506, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.39306095242500305, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0351, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.318985253572464, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0425, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.7674827575683594, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.041, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.7754977941513062, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0436, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.5827674269676208, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0371, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.3957774341106415, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0401, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.47415387630462646, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0344, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.6292631030082703, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.5913583636283875, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0385, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.465749055147171, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0402, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.7115443348884583, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0372, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.7476089596748352, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.042, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.5902891159057617, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0319, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.7117035984992981, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0312, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.7726907730102539, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0381, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.7318345308303833, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0464, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.8139578104019165, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0334, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.6128831505775452, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0338, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.478384405374527, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0361, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.36900776624679565, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0473, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 1.031351923942566, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0417, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.5248333215713501, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0402, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.6325647830963135, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.047, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.8417870402336121, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0406, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.617125391960144, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0385, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.4480224847793579, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0391, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 1.0203324556350708, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0379, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.6231842637062073, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0318, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37685611844062805, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0304, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 1.0700500011444092, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0362, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.4233555495738983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0341, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.7783017158508301, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0331, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.718287467956543, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0385, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.5477543473243713, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0308, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.5601311326026917, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0384, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.4944303631782532, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.5038384199142456, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0382, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.7288672924041748, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0319, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 1.0376721620559692, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0374, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.8827543258666992, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0351, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4307865798473358, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0321, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.5480561256408691, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0532, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.9598987102508545, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0365, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.4162677228450775, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0274, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.8729338049888611, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0437, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.7729384899139404, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0386, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.6997544169425964, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0303, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.49331608414649963, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0333, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.6684675812721252, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0329, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.5638986825942993, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.035, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.8375849723815918, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0431, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.5796175599098206, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0298, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.5302409529685974, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.032, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.43450990319252014, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0415, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.3897189795970917, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0372, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.8202592134475708, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0329, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.8023095726966858, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.3732883930206299, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0326, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.4916521906852722, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.031, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.46110638976097107, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.037, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.8587718605995178, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0351, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.7067242860794067, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.036, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.732545793056488, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.036, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.6573438048362732, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0392, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.6036579608917236, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0383, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.5556638836860657, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0396, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.7848073244094849, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0333, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.5758033394813538, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0315, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.5620765686035156, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0277, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.38210418820381165, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0437, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.6145310997962952, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0368, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.7370103001594543, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0349, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.942118763923645, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0399, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.5294848680496216, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0364, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.5716073513031006, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0313, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.4549729526042938, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0423, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.5841232538223267, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0369, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.3302208483219147, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.032, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.7107377648353577, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0382, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.6884296536445618, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0324, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.6279621720314026, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0314, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.882046103477478, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0408, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.8980706334114075, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0436, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.6433938145637512, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0395, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.6394492983818054, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.041, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.8700910806655884, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0333, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.6309515237808228, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0341, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.7955977916717529, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.8543604016304016, + "learning_rate": 1.663934987558109e-05, + "loss": 0.042, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.6915370225906372, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0347, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.6430726647377014, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0395, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.3080710768699646, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0299, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0407, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.7147136330604553, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0524, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.603560209274292, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.032, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.4913748502731323, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0419, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.532796323299408, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0463, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.7834717631340027, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0318, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.4865007698535919, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0329, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.5567988753318787, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0331, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.7487075328826904, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0408, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0294, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.7240496277809143, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0334, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.44733667373657227, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0378, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.7610008716583252, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0398, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 1.0738579034805298, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0461, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.5492804050445557, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0367, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.7817861437797546, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0392, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.6080313324928284, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0288, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.8218061923980713, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0335, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.6597305536270142, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0398, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.6254639625549316, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0339, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 1.0747283697128296, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0386, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.4679741859436035, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0409, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.7349653244018555, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0355, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.47712597250938416, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0524, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.8520345091819763, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0361, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.6470016837120056, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0296, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.8512763381004333, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0329, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.5876182913780212, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0381, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.47419166564941406, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0348, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.391215056180954, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0366, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.5373614430427551, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0373, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.23266319930553436, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0283, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.8146935105323792, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0377, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.5002696514129639, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0296, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.7518969774246216, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0394, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.44596755504608154, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0359, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.37095823884010315, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.031, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.48388785123825073, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0323, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.4681354761123657, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0573, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.9335370063781738, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0397, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.8231816291809082, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0307, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.7194622755050659, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0435, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.468923419713974, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.5806415677070618, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0422, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.6381694078445435, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0325, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.6025328636169434, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0321, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.7287771701812744, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0432, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.7109095454216003, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0315, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.4904409348964691, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0317, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.7382795214653015, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0296, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 1.2814927101135254, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.4594469368457794, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0297, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.5907943844795227, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.623093843460083, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0314, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.5146417021751404, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0362, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.5858095288276672, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0339, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.4178197383880615, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0445, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.37311851978302, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0321, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.6305625438690186, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0376, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.5927552580833435, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0339, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.4024806022644043, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.5766516327857971, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0325, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.4729812443256378, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0476, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.4650471806526184, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0387, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.6432391405105591, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.6335821151733398, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0307, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.5947774052619934, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0374, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.7248526811599731, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0286, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.5646173357963562, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0426, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.4240330457687378, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0261, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.6439619064331055, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0325, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.5899927020072937, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0328, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.6412765383720398, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.027, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.28143197298049927, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0285, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.2767931818962097, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0312, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.47175201773643494, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0318, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.4454171359539032, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0357, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.4573518931865692, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0319, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.5321150422096252, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0423, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.27531248331069946, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0284, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.663298487663269, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0328, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.9017484188079834, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0328, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0445, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.4777899980545044, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0348, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.5475958585739136, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0418, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.524467408657074, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0301, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.6302708387374878, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0334, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.41625329852104187, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0353, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2699313759803772, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0387, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.701999306678772, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.6053565144538879, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0343, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.864326000213623, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0371, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.7532107830047607, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0323, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.5603524446487427, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0357, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.5668624639511108, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0421, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.6352995038032532, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0381, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.7873902320861816, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0293, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.5853860378265381, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0336, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.525260329246521, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0404, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.4027518033981323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0334, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.9426722526550293, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0397, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.6003656983375549, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0408, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.643667459487915, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0507, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.6342907547950745, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0338, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.4388107657432556, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0393, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.3304736614227295, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0371, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.6479781866073608, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0357, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.5461524128913879, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0367, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.4362160563468933, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0302, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.5188114643096924, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0322, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.34805068373680115, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0355, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.5073755383491516, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0446, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.5647034645080566, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0386, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.5983169078826904, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.4163302481174469, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0278, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.5769792199134827, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.33103784918785095, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0272, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.6019038558006287, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0286, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.8199634552001953, + "learning_rate": 1.56658563993822e-05, + "loss": 0.041, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.7426667213439941, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0327, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.3630203306674957, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0316, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.7804543972015381, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0369, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.43314239382743835, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0362, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.5570499897003174, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0307, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.5796618461608887, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0312, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.7355082035064697, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0357, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.39807555079460144, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0281, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.7723329663276672, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0314, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.3936077058315277, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0344, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.6881195902824402, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0343, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.5343065857887268, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0336, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.6643530130386353, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.032, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.5642407536506653, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0326, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.6929567456245422, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0351, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.33013442158699036, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0362, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 1.056101679801941, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0443, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.5164589881896973, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0446, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.319035142660141, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0367, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.8530817627906799, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0321, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.7768056392669678, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0318, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.4015219211578369, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0263, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.6409371495246887, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0371, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.5829829573631287, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0424, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.8098331093788147, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0318, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.40581029653549194, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0345, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.5018268823623657, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0338, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.3689005970954895, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0304, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.4961407482624054, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0349, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.5551972389221191, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0389, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.5989762544631958, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0308, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.33431145548820496, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0291, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.5390793085098267, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0409, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.6348057389259338, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0299, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.9015149474143982, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0372, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.4148661494255066, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0351, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.48212167620658875, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0369, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.6210904121398926, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0387, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.4606397747993469, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0325, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.597671627998352, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0264, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.39612457156181335, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0291, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.514916718006134, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0327, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.3551333248615265, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0306, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.3721555173397064, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0343, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3669307231903076, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0339, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.5142899751663208, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0388, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.7722563147544861, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0319, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.5405625104904175, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.025, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.6617732048034668, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0361, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.8938334584236145, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0326, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.7913880944252014, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0325, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.6919751763343811, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0353, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.6518043279647827, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0292, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.8302627801895142, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0292, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.6278629302978516, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0314, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.42736759781837463, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0313, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 1.0469647645950317, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.038, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.4306422173976898, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.692587673664093, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.034, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.8272542953491211, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0332, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.700703501701355, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0435, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22474133968353271, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0348, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.47771376371383667, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0365, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.5043072700500488, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0336, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.4886966347694397, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0291, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.3845444321632385, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0418, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.6324570775032043, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0357, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.5614244937896729, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0351, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.4815816879272461, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0401, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.7729785442352295, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0357, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.589121401309967, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0319, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.5420895218849182, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0346, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.4504237771034241, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0279, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.26984909176826477, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.6075000762939453, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0319, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.6065084338188171, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0383, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.573225736618042, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0424, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.8821173906326294, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0409, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.4947790205478668, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0472, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.748337984085083, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0384, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.6375566124916077, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0373, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.6218035221099854, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0343, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.4296681880950928, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0317, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3609360158443451, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0348, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.49597665667533875, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.034, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.4339931309223175, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0351, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.44051092863082886, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0391, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.41610655188560486, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0345, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.6215106844902039, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0439, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.6418285965919495, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0289, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.6148926019668579, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0396, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.8690620064735413, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0371, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.4794996678829193, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.7622746229171753, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0396, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 1.0384955406188965, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0352, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.33424243330955505, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0272, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.5626234412193298, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0267, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.31714314222335815, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0297, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.8281066417694092, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0337, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.6054716110229492, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0336, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.5764144659042358, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0296, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.4696876108646393, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0318, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.5324695110321045, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0294, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.2989593744277954, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0275, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.6373855471611023, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0334, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.5332064032554626, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0333, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.4900652766227722, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.6812027096748352, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0321, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.6765509843826294, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0329, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.5016193389892578, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.034, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.5259473919868469, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0341, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.4551076292991638, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0289, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.5946309566497803, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0367, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.8045580387115479, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0292, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 1.089473843574524, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0433, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.7314861416816711, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0344, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.3244793713092804, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0329, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.9454575181007385, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.041, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.4321480393409729, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0338, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.7338399887084961, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0317, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.5811594724655151, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0299, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 1.1259782314300537, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0402, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.4460951089859009, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0279, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.4996945858001709, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0331, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.6428117156028748, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0339, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.7815113663673401, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0333, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.46364331245422363, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0321, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.6084109544754028, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0347, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.5775942206382751, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.4764224886894226, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0326, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.49608105421066284, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.033, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.40599140524864197, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0323, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.44920462369918823, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0348, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.393081396818161, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0329, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.5393109917640686, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0332, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.49641427397727966, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0341, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.4762181341648102, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0293, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.7498350143432617, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0338, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.5212231874465942, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0336, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3803718388080597, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0336, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.3723069429397583, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0313, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.6411343216896057, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0298, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.7487270832061768, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0334, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.4146348237991333, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0362, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.6354920864105225, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0345, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.8422425985336304, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.6452838182449341, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0317, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.6057304739952087, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0349, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.4880058467388153, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0283, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.6094764471054077, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0424, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.552979588508606, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0318, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.5134180188179016, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0267, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.3264164626598358, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0347, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.6406404972076416, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0326, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.4818336069583893, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0357, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.4660695791244507, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0348, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.527518093585968, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0293, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.5105645656585693, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0299, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.5807327628135681, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0348, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.34552720189094543, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0281, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.6902264952659607, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0345, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.7842390537261963, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0392, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.37371599674224854, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0307, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.4447094798088074, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0343, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.5179654359817505, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0328, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.34313148260116577, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0327, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.5038807988166809, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0398, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.5751231908798218, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0365, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.23205915093421936, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0338, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.3348182141780853, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0264, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.432725727558136, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0377, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.5504162907600403, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0334, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.7994229793548584, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0369, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.7374292016029358, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0305, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.786674976348877, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0283, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.39285191893577576, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.028, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.49710261821746826, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0285, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.2925172448158264, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0353, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.5930903553962708, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0265, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.5205737352371216, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0349, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.5042659044265747, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0376, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.6537132263183594, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0402, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.5453435182571411, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0344, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.7153663635253906, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0365, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.4821360409259796, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0359, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.389950156211853, + "learning_rate": 1.403120543105273e-05, + "loss": 0.031, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.6750137805938721, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0353, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.5380377173423767, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0329, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.45814576745033264, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0312, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.6910536289215088, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0349, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.49182868003845215, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0377, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.41329771280288696, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0383, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.47242429852485657, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0313, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.45115360617637634, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0294, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.44364428520202637, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0328, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.4205247461795807, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0282, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 1.0961225032806396, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0274, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.6065059304237366, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0327, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.3095875084400177, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0348, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.8527400493621826, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0285, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.4449825882911682, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0435, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 1.1708461046218872, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0312, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.6145966053009033, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0283, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.5100684762001038, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0331, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.37704023718833923, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0327, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.6774486899375916, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0347, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.4984931945800781, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0303, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.6189061403274536, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0316, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4665672183036804, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.038, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.898800790309906, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0292, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.5205129384994507, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0322, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.588542640209198, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0307, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.620620846748352, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.035, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.639234185218811, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0296, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.38672956824302673, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0355, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.5244165062904358, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0305, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.8960945010185242, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0323, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3789278566837311, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.031, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.5104514956474304, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0405, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.5860878825187683, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0376, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.9913963079452515, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0386, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.4112319350242615, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0276, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.703815221786499, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0303, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.7342479825019836, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0303, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.46025165915489197, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0324, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.3976695239543915, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0255, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.4137699604034424, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0298, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.6333696842193604, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0438, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.5179958343505859, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0268, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.5947912335395813, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0266, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.7916423678398132, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0363, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.7686305046081543, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0338, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.5727254152297974, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0275, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.8913756012916565, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0365, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.45855259895324707, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0401, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.8214496374130249, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0371, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.5001949667930603, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.033, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.6546716094017029, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0422, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.35789239406585693, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0323, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.7539666891098022, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0316, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.422543466091156, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0388, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.5595449805259705, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0351, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.3847978115081787, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0285, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.4276559352874756, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0292, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.5125867128372192, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0351, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.7208243012428284, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0293, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.5181360244750977, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0316, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.3499206304550171, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0281, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.26258599758148193, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.027, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.7002774477005005, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.031, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.5419202446937561, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0384, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.3112017512321472, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0234, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.6459445357322693, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0302, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.5128807425498962, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0385, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.41403454542160034, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0321, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.4647153615951538, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0358, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.29951611161231995, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0288, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.3440749943256378, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0274, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.413753867149353, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0276, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.29087361693382263, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.03, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.7001593708992004, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0277, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.47245970368385315, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0426, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.5747501850128174, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0337, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.42420580983161926, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0407, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.2931080162525177, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0344, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.8410253524780273, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0385, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.27601751685142517, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0304, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5673372745513916, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0261, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.5385505557060242, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0296, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.4159039556980133, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0343, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 1.0409079790115356, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0325, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.5017931461334229, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0311, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.45170727372169495, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0302, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.7260886430740356, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0353, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.7251535058021545, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0329, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.21863135695457458, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0354, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.5168152451515198, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0268, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.509765088558197, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0321, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.4227997958660126, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.031, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.5740527510643005, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0351, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.5497387647628784, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0277, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.3965212106704712, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.028, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.43198928236961365, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0421, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.42254316806793213, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0335, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.3395012617111206, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0309, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.6258816719055176, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0287, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.7914189100265503, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0263, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.4104739725589752, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0282, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.47704172134399414, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0358, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.7908433675765991, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0341, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.7039026021957397, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0369, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.4095489978790283, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.047, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.6500707864761353, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0285, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3794250190258026, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0293, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.3065261244773865, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.031, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.3773103654384613, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0303, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.602186918258667, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0398, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.5309048891067505, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0251, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.9474682211875916, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0345, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.7786683440208435, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0289, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.6320096850395203, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0326, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.7034086585044861, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0332, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.5060988664627075, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0337, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.7484520673751831, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0317, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.6556681394577026, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0349, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.41952699422836304, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0318, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.4678110182285309, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0328, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.35579657554626465, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0346, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.5984554290771484, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0277, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.41169118881225586, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0288, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.5163332223892212, + "learning_rate": 1.285944160290905e-05, + "loss": 0.027, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.780305802822113, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0249, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.4293205142021179, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0302, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.650065004825592, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0349, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.3155161142349243, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0333, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.5841111540794373, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0371, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.3873291015625, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0304, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.39657002687454224, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0279, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.6305680871009827, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0293, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.5810249447822571, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0317, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.6288999319076538, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0283, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.5402754545211792, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0258, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 1.3184820413589478, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0398, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.9564218521118164, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0301, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.8810652494430542, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0376, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.4254887104034424, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0336, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.45076319575309753, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0266, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.6057546138763428, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0292, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.4007343649864197, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0352, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.4183088541030884, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0265, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.368300199508667, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0326, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.4838104844093323, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0262, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.5136057138442993, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0299, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.5161435604095459, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0339, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.6350359320640564, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0361, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.5247905254364014, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0259, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.5668240785598755, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0324, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.48688119649887085, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0395, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.8496071100234985, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0326, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.7072296142578125, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0307, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.7262448072433472, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0376, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.5265096426010132, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0331, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.7246168851852417, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0286, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.4539868235588074, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.036, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.36881664395332336, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0302, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.37113773822784424, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0278, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.537762463092804, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0325, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.6519997715950012, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0309, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.31448549032211304, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0245, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.43815988302230835, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0398, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.525791585445404, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0261, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.4887944757938385, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.025, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.5287007689476013, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0278, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.7277513146400452, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0304, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.6415050029754639, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0292, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.48691895604133606, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0337, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.53068608045578, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0338, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.5464624762535095, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0303, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.3911614418029785, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0345, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.6894099116325378, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0365, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.5268317461013794, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0405, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.8635499477386475, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0321, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21542859077453613, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0264, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.6257337331771851, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0355, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.6525475978851318, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0304, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.4599299430847168, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0314, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.7497361898422241, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.031, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.3124896287918091, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0257, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.6170748472213745, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0323, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.4619428515434265, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0315, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.5088011026382446, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0255, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.5397948622703552, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0265, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.457082062959671, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0279, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.4131294786930084, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0269, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 1.1949660778045654, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.6057063341140747, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0306, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.26918280124664307, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0283, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.48841091990470886, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0323, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.6195886135101318, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0295, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.5798623561859131, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.031, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.4877539277076721, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0267, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.33261221647262573, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0261, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.8361077904701233, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0311, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.305922269821167, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0302, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.22662357985973358, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.028, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.4273515045642853, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0307, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.521216869354248, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0277, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.7090896368026733, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0346, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.3693661391735077, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0305, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.3651321530342102, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0263, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.5577923655509949, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0357, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.6504148840904236, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0404, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.49205282330513, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.035, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.6053458452224731, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0328, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.5949649214744568, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0302, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.5310356020927429, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0264, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4087911546230316, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0273, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.35929426550865173, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0274, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.5112904906272888, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0253, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.39148232340812683, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0305, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.47718697786331177, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0304, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.620936393737793, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0289, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.8953443169593811, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0328, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.4663226902484894, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0302, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.707167387008667, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0319, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.5325813889503479, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0318, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.6239158511161804, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0289, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.38823947310447693, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0266, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.48849165439605713, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0234, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.23214028775691986, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0276, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3467197120189667, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0282, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.2009357064962387, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0298, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.8589951395988464, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0264, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.43969056010246277, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0292, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.5750611424446106, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0289, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.5399556756019592, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0307, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.20517395436763763, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0249, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.7490189671516418, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0246, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.6661257743835449, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0325, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.571394681930542, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0342, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.8792482018470764, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0332, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.5770248770713806, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0286, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.62962406873703, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0246, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.4651380479335785, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.037, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.5087499022483826, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0265, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.44421979784965515, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0306, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.6521517038345337, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0334, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.5384942889213562, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0296, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.41909387707710266, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0297, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.6697047352790833, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0331, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.4015032947063446, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0326, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.48070228099823, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0278, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.8651071786880493, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0242, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 1.17703378200531, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0288, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.45865103602409363, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0322, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.41243845224380493, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0297, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.482997864484787, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0305, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.5319142937660217, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0284, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.6116752028465271, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0311, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.4214901328086853, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0269, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.6246733069419861, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.026, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.4263368248939514, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0305, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.4059041738510132, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.022, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.6362516283988953, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0265, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.2905973494052887, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0297, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.42270833253860474, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0255, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.26410749554634094, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0252, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.7570974230766296, + "learning_rate": 1.153689339251154e-05, + "loss": 0.027, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.5941224098205566, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0295, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.3985750079154968, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0337, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.3877560496330261, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.024, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.44742006063461304, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0284, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.3280893564224243, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0318, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.5289477109909058, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0341, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.4976208806037903, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0239, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.6153465509414673, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0252, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.6112402677536011, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0292, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.4973732531070709, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0307, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.5871816277503967, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0254, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 1.2150986194610596, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.033, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.6406526565551758, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0265, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.4251798093318939, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0269, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.4702431857585907, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0311, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.3235304355621338, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0236, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.4913889467716217, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0231, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.4980977177619934, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0289, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.740922212600708, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0334, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.3305300772190094, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0301, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.7037357091903687, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0311, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.44783756136894226, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0339, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.7776843309402466, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0349, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.49181437492370605, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0285, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.333814799785614, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0284, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 1.203652262687683, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0365, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.521643877029419, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0313, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.33309581875801086, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0265, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.48567256331443787, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0357, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8473871946334839, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0355, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.43827518820762634, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0266, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.5849157571792603, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0317, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.5690399408340454, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0266, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.6484784483909607, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0294, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.8894811272621155, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0239, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4575272798538208, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0323, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.4288756847381592, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.032, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.8871303200721741, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0243, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.5861580967903137, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0335, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.4159319996833801, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0247, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.6948496699333191, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0299, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.5089551210403442, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0333, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.6912631392478943, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0303, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.6213784217834473, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0295, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.4634060561656952, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0261, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.5664045214653015, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0262, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.7963227033615112, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0278, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.45378491282463074, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0268, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.8970746994018555, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0271, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.5109472274780273, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0307, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.5023297667503357, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0263, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.6055631041526794, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0285, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.38602766394615173, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0282, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.5447302460670471, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0319, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.6613780856132507, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0271, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 1.0358555316925049, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.026, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.4463629722595215, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0271, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.5373798608779907, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.025, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.7735916972160339, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0325, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.5017692446708679, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0262, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.3406142592430115, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0271, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.28971537947654724, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0238, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.45441415905952454, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0261, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.4653581976890564, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.026, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.5449947714805603, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0314, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.41015395522117615, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0272, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.5936392545700073, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0269, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.5043690800666809, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0256, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.6176534295082092, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0285, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.6774734258651733, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0268, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.7045454978942871, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0305, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.5905448794364929, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0284, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.7881343364715576, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0321, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.6635507941246033, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0284, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.46298888325691223, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0394, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.5187172889709473, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0257, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.5974661707878113, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0305, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.5171123743057251, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0275, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.35988888144493103, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0295, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.30543047189712524, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0334, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.6582810878753662, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0309, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.4986134171485901, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0294, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.5560855269432068, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0224, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.28974607586860657, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0313, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.24015791714191437, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.026, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.2704199552536011, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0244, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.6661707162857056, + "learning_rate": 1.068904422762975e-05, + "loss": 0.027, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.5058556795120239, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0254, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.7086800336837769, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0242, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.6752822399139404, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0262, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.8279762268066406, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0312, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.5070614814758301, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0308, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.3933897614479065, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0287, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.37238794565200806, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0325, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.7591347098350525, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0265, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.4841652810573578, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0331, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.45236295461654663, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0412, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.4774094820022583, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0289, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.47564345598220825, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0294, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.341337651014328, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0281, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.341701865196228, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0224, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.6621959209442139, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0283, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.348466694355011, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0234, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.35208311676979065, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0248, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.4973156154155731, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0246, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.3668982982635498, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0228, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.4771873950958252, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0303, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.3595021665096283, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0265, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.6013099551200867, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0297, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.40996676683425903, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0321, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.45742037892341614, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0288, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.8092222213745117, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0278, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.32741186022758484, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0288, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.5716732740402222, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0256, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.3263239562511444, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0271, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.35390567779541016, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0266, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.36520150303840637, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0265, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.46227532625198364, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0305, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.40079647302627563, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0327, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.3689155578613281, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0249, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.49527907371520996, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.029, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.38931334018707275, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0233, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.5698918700218201, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0269, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 1.0959579944610596, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.029, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.6321646571159363, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0276, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.7166606783866882, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0292, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.6464444994926453, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0246, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.7318128347396851, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0296, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.4828032851219177, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0247, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.4509548842906952, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0241, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.413630872964859, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0313, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.42443349957466125, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0316, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.8199112415313721, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0389, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.28918105363845825, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0242, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.6759344339370728, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0308, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.5480250120162964, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.025, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.48897549510002136, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.027, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.6111220121383667, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0276, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.8852546215057373, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0251, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.5098162889480591, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.022, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.45974940061569214, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0206, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3925095200538635, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0251, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.5461363792419434, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0217, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.5685333609580994, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0231, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.494150310754776, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0243, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.8770614862442017, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0286, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.27142134308815, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0253, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.3365682363510132, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0241, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.5512370467185974, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0242, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.5581703782081604, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0276, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.306773841381073, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0262, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.44620928168296814, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0229, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.5870804786682129, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0228, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.26162099838256836, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0278, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.27250319719314575, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0293, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.8330137729644775, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0315, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.5206989645957947, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0282, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.5408382415771484, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0359, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.30517199635505676, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0267, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.5315027236938477, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0206, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.46061626076698303, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0222, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.47393080592155457, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0262, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.3686772882938385, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0254, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.3312757611274719, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0243, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.565447986125946, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0267, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.5690101385116577, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0237, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.44088438153266907, + "learning_rate": 9.911670744652783e-06, + "loss": 0.028, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.3708919882774353, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0265, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.589698851108551, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0297, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.6541375517845154, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0288, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.5304558873176575, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0243, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.5774737000465393, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0277, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.5616280436515808, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0267, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.6129759550094604, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0223, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.45278221368789673, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0304, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.44487202167510986, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0296, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.5391712188720703, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0256, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.43523359298706055, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0277, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.5308435559272766, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0242, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.3361283540725708, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0236, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.3764631450176239, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0304, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.9003425240516663, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0278, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.2787775993347168, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0219, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.40089285373687744, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0284, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.3619711101055145, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0252, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.7354542016983032, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0242, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.3854006826877594, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0302, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.3318389058113098, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0265, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.5286651849746704, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0235, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.24921932816505432, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0259, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.7376067042350769, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0238, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.35099226236343384, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0257, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.3805389702320099, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0198, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.4433703124523163, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0241, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.3667793571949005, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0268, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.2963331639766693, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0223, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.9817414879798889, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0248, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.6529688835144043, + "learning_rate": 9.612315882780393e-06, + "loss": 0.032, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.7663154602050781, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0267, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.6086964011192322, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0281, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.5240464806556702, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0339, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.6558368802070618, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0284, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.6192268133163452, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0309, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.5293763875961304, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0257, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.38831329345703125, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0239, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.12827467918396, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0323, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.411818265914917, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0274, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.5521355867385864, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0233, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.26673075556755066, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0317, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.5205486416816711, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0273, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.8010990619659424, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0292, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.420612633228302, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0274, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.4811270236968994, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0277, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.4959382712841034, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0288, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.4607725739479065, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0245, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.9101414680480957, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0283, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.38626620173454285, + "learning_rate": 9.42959233811777e-06, + "loss": 0.026, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.5709372758865356, + "learning_rate": 9.419993062475743e-06, + "loss": 0.021, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.4417913854122162, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0291, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.5651213526725769, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0228, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.4716165363788605, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0242, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.9120892286300659, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0296, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.5004292130470276, + "learning_rate": 9.372024722887089e-06, + "loss": 0.033, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.3422714173793793, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0284, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.5391610264778137, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0362, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.5446203351020813, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0247, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.5441875457763672, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0284, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.48274070024490356, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0245, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.6035326719284058, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0226, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.3104001581668854, + "learning_rate": 9.304949604077693e-06, + "loss": 0.029, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.27859869599342346, + "learning_rate": 9.295375311262483e-06, + "loss": 0.022, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.3896406292915344, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0235, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.4526473581790924, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0289, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.6624506115913391, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0265, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.6976125836372375, + "learning_rate": 9.257098257046206e-06, + "loss": 0.029, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.5974310040473938, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0205, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.7627739906311035, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0333, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.3166525065898895, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0309, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.41519322991371155, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0223, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.31840237975120544, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0239, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.47412827610969543, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0228, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.41170552372932434, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0209, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.45858854055404663, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0243, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.7870534658432007, + "learning_rate": 9.171095634265995e-06, + "loss": 0.027, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.4080354869365692, + "learning_rate": 9.161550369445782e-06, + "loss": 0.023, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.47916823625564575, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0303, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.6911760568618774, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0263, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.3980148732662201, + "learning_rate": 9.132927564918328e-06, + "loss": 0.028, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.47085851430892944, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0266, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.5085862874984741, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0239, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.5219245553016663, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0267, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.5199264287948608, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0277, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.6157195568084717, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0343, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.5366696715354919, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0271, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.3640076220035553, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0258, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.5320505499839783, + "learning_rate": 9.05669731553499e-06, + "loss": 0.024, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.507826566696167, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0253, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.741392195224762, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0242, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.5325136184692383, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0224, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.4709665775299072, + "learning_rate": 9.018636566864313e-06, + "loss": 0.026, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.4371986985206604, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0264, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.47594818472862244, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0224, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.488423228263855, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0261, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.24745763838291168, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0206, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.5042629837989807, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0305, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.5255836844444275, + "learning_rate": 8.961615424107555e-06, + "loss": 0.026, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.4605107307434082, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0274, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.3252561390399933, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0227, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.35779184103012085, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0296, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.2960403263568878, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0212, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.6344659328460693, + "learning_rate": 8.914163487132906e-06, + "loss": 0.026, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.4614463150501251, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0234, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4490053951740265, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0265, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.5291271209716797, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0326, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.5311887264251709, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0257, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.5647584199905396, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0295, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.3913862705230713, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0256, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.4476219415664673, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0248, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.7807655930519104, + "learning_rate": 8.83836825410936e-06, + "loss": 0.026, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.38984328508377075, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0247, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.5757346153259277, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0296, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.25636178255081177, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0222, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.45617344975471497, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0224, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.3066493272781372, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0237, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.26513972878456116, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0277, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.445230633020401, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0248, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.4914413392543793, + "learning_rate": 8.762735374981932e-06, + "loss": 0.022, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.41469570994377136, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0245, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.33235347270965576, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0229, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.4890037775039673, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0247, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.41330578923225403, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0285, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.6309427618980408, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0233, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.42090296745300293, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0254, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.5888519287109375, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0262, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.5488774180412292, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0262, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.48015111684799194, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0219, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.4484168291091919, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0276, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.4128018319606781, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0218, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.5151517987251282, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0242, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.6248350143432617, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0267, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.4116908013820648, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0242, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.6138579249382019, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0282, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.22843605279922485, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0284, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.49555841088294983, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0244, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.5752411484718323, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0275, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.5129706859588623, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0237, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.751230001449585, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0257, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.47749435901641846, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0277, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21702095866203308, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0255, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.30658838152885437, + "learning_rate": 8.54624657467318e-06, + "loss": 0.024, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.3589625954627991, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0215, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.5434426069259644, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0224, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.8732438683509827, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0289, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.34988290071487427, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0226, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.4021032154560089, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0248, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.4676196873188019, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0235, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.41646474599838257, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0235, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.5892519950866699, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0221, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.5757095217704773, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0258, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.4664652645587921, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0275, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.4674879014492035, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0285, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.7277936339378357, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0316, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.40373867750167847, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0213, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.8632686138153076, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0239, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.5620945692062378, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0259, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3430384695529938, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0287, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.46981969475746155, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0218, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.3494231700897217, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0238, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.514975368976593, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0205, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.6442168951034546, + "learning_rate": 8.359228888944986e-06, + "loss": 0.021, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.32178881764411926, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0219, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.48865941166877747, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0261, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.6131434440612793, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0269, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.4471806585788727, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0251, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.8255780935287476, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0229, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.843673586845398, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0278, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.4278610348701477, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0228, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.5036011338233948, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0291, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.5141382813453674, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0217, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.8976346850395203, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0248, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.5634751319885254, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0276, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.5327013731002808, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0279, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.2723959982395172, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0225, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.4455258846282959, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0222, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.3784103989601135, + "learning_rate": 8.219774325200873e-06, + "loss": 0.024, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.8102694749832153, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0231, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.5179240703582764, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0255, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.39830490946769714, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0264, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.32860279083251953, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0241, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.5459582209587097, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0193, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.3841477036476135, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0282, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.7849119305610657, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0319, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.4457703232765198, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0279, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.30464428663253784, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0184, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 1.0635287761688232, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0265, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.33294421434402466, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0235, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.5644985437393188, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0218, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.4975566565990448, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0261, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.7503839135169983, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0218, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.35363277792930603, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0198, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.43968406319618225, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0253, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.4553394615650177, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0266, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.45489153265953064, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0264, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.424696147441864, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0209, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.4819740653038025, + "learning_rate": 8.03498318084394e-06, + "loss": 0.022, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.564834475517273, + "learning_rate": 8.025779439806006e-06, + "loss": 0.024, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.7905157804489136, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0261, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.6985124349594116, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0315, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.42378291487693787, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0237, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.5980759263038635, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0217, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.45916232466697693, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0235, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.25486481189727783, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0231, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.4072360694408417, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0261, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3813820481300354, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0209, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.3040210008621216, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0225, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.30910906195640564, + "learning_rate": 7.933935782312965e-06, + "loss": 0.026, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.6573566794395447, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0262, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.30632153153419495, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0251, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.3277539610862732, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0233, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.49589917063713074, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0211, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.4149130880832672, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0203, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.7051926851272583, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0272, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.8553881049156189, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0236, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.5676615238189697, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0242, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.29548707604408264, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0236, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.36076608300209045, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0219, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.3657922148704529, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0227, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.27593615651130676, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0251, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.35554730892181396, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0259, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.45652297139167786, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0274, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.5757999420166016, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0222, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.5138059854507446, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0216, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.338874876499176, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0232, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.48215195536613464, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0226, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.30239933729171753, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0205, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.6099343299865723, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0219, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.6730902791023254, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0239, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.4575020968914032, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0204, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.2673267424106598, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0222, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.3593531548976898, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0225, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.5385488867759705, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0248, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.3900541663169861, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0277, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.6182276010513306, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0241, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.4897976815700531, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0229, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.5717247128486633, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0273, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.4837515950202942, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0219, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.31954509019851685, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0271, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.23005163669586182, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0204, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.500217616558075, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0229, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.47326523065567017, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0203, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.5074726939201355, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0249, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.6583673357963562, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0243, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.7585731744766235, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0264, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.3782348036766052, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0216, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.43963512778282166, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0201, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.6450467109680176, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0254, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.3420482575893402, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0224, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.3532888889312744, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0216, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.32494598627090454, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0196, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.2898419499397278, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0234, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.4379838705062866, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0233, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.5390518307685852, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0169, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.3786150813102722, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0203, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.3376149833202362, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0266, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.40810349583625793, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0241, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.24485738575458527, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0199, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.4670563340187073, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0184, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.374255508184433, + "learning_rate": 7.4623904967312e-06, + "loss": 0.018, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.4191536605358124, + "learning_rate": 7.453427567620127e-06, + "loss": 0.022, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3807078003883362, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0232, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.7537381649017334, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0202, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.36507129669189453, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0236, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.24461498856544495, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0221, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.351654589176178, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0236, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.35627686977386475, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0213, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.4586603343486786, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0304, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.4082098603248596, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0237, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.47707459330558777, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0247, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.4687316119670868, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0344, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.4660017788410187, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0214, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.4644101560115814, + "learning_rate": 7.346200065486093e-06, + "loss": 0.022, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.3139079213142395, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0234, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.36445188522338867, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0262, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.6457782983779907, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0261, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.4184044599533081, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0245, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.44356703758239746, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0215, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.5394402742385864, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0302, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 0.5960429906845093, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0234, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2850514352321625, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0243, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.45071718096733093, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0233, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.3157344162464142, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0254, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.45518410205841064, + "learning_rate": 7.248450164740439e-06, + "loss": 0.024, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.2323702722787857, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0226, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.6025039553642273, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0246, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.4983830749988556, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0199, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.3684524595737457, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0252, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.36924007534980774, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0277, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.3531496822834015, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0228, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.3995579779148102, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0193, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.4124946892261505, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0221, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.3897329866886139, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0221, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.45230787992477417, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0238, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.45878538489341736, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0244, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.4302407503128052, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0237, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.30422642827033997, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0173, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.49566513299942017, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0201, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.43262094259262085, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0227, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.8250450491905212, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0259, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.3265332281589508, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0205, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.2871774435043335, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0201, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.4341558814048767, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0199, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.43365293741226196, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0201, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.5876246690750122, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0205, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.2719171643257141, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0211, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.38791123032569885, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0244, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.4082484543323517, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0206, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.5010205507278442, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0245, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.4404369294643402, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0268, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.5171347856521606, + "learning_rate": 7.010805483338283e-06, + "loss": 0.024, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.5137951970100403, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0241, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.563709557056427, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0193, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.44687238335609436, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0207, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.33815798163414, + "learning_rate": 6.975884226362e-06, + "loss": 0.0246, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.33789384365081787, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0206, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.38053908944129944, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0195, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.5730066299438477, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0199, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.42453598976135254, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0218, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.48010921478271484, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0328, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.5227254629135132, + "learning_rate": 6.923644220932124e-06, + "loss": 0.019, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4078599810600281, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0212, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.4473094046115875, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0281, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.3459968864917755, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0231, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.4205886721611023, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0256, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.5397320985794067, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0214, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.6208626627922058, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0224, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.34377506375312805, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0197, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.4086950123310089, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0202, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.5211176872253418, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0201, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3705415725708008, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0219, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.32692769169807434, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0204, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.42599135637283325, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0213, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.565449595451355, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0223, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.4027825593948364, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0233, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.4833034574985504, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0309, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.5570312738418579, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0213, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.30241742730140686, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0197, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.37468239665031433, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0214, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.5555301904678345, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0223, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.6084730625152588, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0261, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.5931955575942993, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0237, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.30350545048713684, + "learning_rate": 6.733587654719298e-06, + "loss": 0.02, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.6784055233001709, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0281, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.5559973120689392, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0204, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.7529487013816833, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0235, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.7032052874565125, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0176, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.5018401741981506, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0197, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.5020368695259094, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0231, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.3605690598487854, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0254, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.3482762575149536, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0223, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.4260469675064087, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0199, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.23622000217437744, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0239, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.3683573007583618, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0223, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.32972025871276855, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0228, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.4159783124923706, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0221, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.24288412928581238, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0188, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.42375463247299194, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0183, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.26672226190567017, + "learning_rate": 6.596880604028027e-06, + "loss": 0.02, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.30816635489463806, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0219, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.315452516078949, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0218, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.5412175059318542, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0233, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.4290241003036499, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0233, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.3977762460708618, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0239, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.4023628532886505, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0197, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.8707197308540344, + "learning_rate": 6.53748481975927e-06, + "loss": 0.029, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.37878328561782837, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0218, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.685556173324585, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0248, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.5783588886260986, + "learning_rate": 6.512107839793337e-06, + "loss": 0.02, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.5456825494766235, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0279, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.6162738800048828, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0259, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.38887348771095276, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0198, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.5207514762878418, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0201, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.671120822429657, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0259, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.28870952129364014, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0175, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.3909374177455902, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0214, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.3419650197029114, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0217, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.563515305519104, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0185, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.6295453310012817, + "learning_rate": 6.427861749601945e-06, + "loss": 0.023, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.4404713213443756, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0188, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.698448121547699, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0225, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.5679222941398621, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0213, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.5237470269203186, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0261, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.4205586016178131, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0232, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.36608314514160156, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.02, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.49511757493019104, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0247, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.3475521206855774, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0202, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.36345914006233215, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0197, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.34304162859916687, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0183, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.41459065675735474, + "learning_rate": 6.335811156758245e-06, + "loss": 0.02, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.34139952063560486, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0211, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.29463231563568115, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0225, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.37984198331832886, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0201, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.21912901103496552, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0226, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.34660178422927856, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0179, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.6080809235572815, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0187, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.43388310074806213, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0226, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.53389972448349, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0237, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.39731428027153015, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0176, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.32715681195259094, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0211, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.36709150671958923, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0194, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.5554866790771484, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0202, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.26253199577331543, + "learning_rate": 6.227878992893104e-06, + "loss": 0.02, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.3686104714870453, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0191, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.36151114106178284, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0213, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.5019435882568359, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0203, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 1.1914043426513672, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0249, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.45042529702186584, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0244, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.3239169120788574, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0219, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.3253174424171448, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0226, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.6497724652290344, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0238, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.5800855159759521, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0211, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.29717954993247986, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0198, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.35056066513061523, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0219, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.28448906540870667, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0227, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.33300310373306274, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0165, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.5134487748146057, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0219, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.45153549313545227, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0191, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.6483689546585083, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0211, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.5660327076911926, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0207, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.6027820706367493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0201, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.6102983951568604, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0207, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.4383072257041931, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0275, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.42298370599746704, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0204, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.30508092045783997, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0195, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.6242369413375854, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0215, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.38399502635002136, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0201, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.4721924066543579, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0243, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.6958035230636597, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0201, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.3826717436313629, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0236, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.3098534941673279, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0216, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.43973061442375183, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0234, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.46570682525634766, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0226, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.46847036480903625, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0188, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.5139725804328918, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0195, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.48436662554740906, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0206, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.3445553481578827, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0241, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.8473356366157532, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0248, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.6241415143013, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0242, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.7302873730659485, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0224, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.29269692301750183, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0181, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.4065910577774048, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0253, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.36930134892463684, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0203, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.5521696209907532, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0208, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.3761119544506073, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0209, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.3330603241920471, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0233, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.27771884202957153, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0162, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.4225069284439087, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0177, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.33680275082588196, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0199, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.4399181604385376, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0236, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.49677175283432007, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0265, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.39700835943222046, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0193, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.4604041278362274, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0208, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.26002946496009827, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0197, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.3256632685661316, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0192, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3573099672794342, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0184, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.3116256892681122, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0197, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.39247608184814453, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0219, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.31291085481643677, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0194, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.5996116399765015, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0264, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.24854864180088043, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0207, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.5746667385101318, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0195, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.5744135975837708, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0182, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.5161272883415222, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0212, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.5889247059822083, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0172, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.53412926197052, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0209, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.3421672582626343, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0193, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.409906268119812, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0173, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.5139239430427551, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0198, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.5014253258705139, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0177, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.5942979454994202, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0206, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.218281552195549, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0204, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.43725427985191345, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0215, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.3467969000339508, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0168, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.2697127163410187, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0214, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.43687018752098083, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0262, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.47759339213371277, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0212, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.33211249113082886, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0228, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.29453045129776, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0233, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.34539318084716797, + "learning_rate": 5.608700869895367e-06, + "loss": 0.021, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.6664339900016785, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0203, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.21404555439949036, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0209, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.4320753812789917, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0236, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.415399968624115, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0235, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.2643829584121704, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0203, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.4354988932609558, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0172, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.43992263078689575, + "learning_rate": 5.554208267666996e-06, + "loss": 0.018, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.32208460569381714, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0183, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.27261701226234436, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0196, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.4348963499069214, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0173, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.40379852056503296, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0202, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.4592876136302948, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0219, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.4797484278678894, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0182, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.47892817854881287, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0185, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.46308979392051697, + "learning_rate": 5.492314644463202e-06, + "loss": 0.018, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.7745133638381958, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0207, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.6577957272529602, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0166, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.43036580085754395, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0218, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.41811347007751465, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0214, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.31980884075164795, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0198, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.3632652461528778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0209, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.467146635055542, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0173, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.5659807920455933, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0199, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.24540813267230988, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0178, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.3122001588344574, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0222, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2879388928413391, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0173, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.5185259580612183, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0168, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.239187091588974, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0198, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3844532370567322, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0179, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.3842040002346039, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0204, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.26496851444244385, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0172, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.40850451588630676, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0189, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21669425070285797, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0192, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.43664559721946716, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.021, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.49064821004867554, + "learning_rate": 5.339400468833427e-06, + "loss": 0.02, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.9060949683189392, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0204, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.3413904309272766, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0212, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.2620849311351776, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0201, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.3972470760345459, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0216, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.4422028064727783, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0177, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.2595955431461334, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0214, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.43522438406944275, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0226, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.33024686574935913, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0199, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.3532852232456207, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0194, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.3963644802570343, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0171, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.37003734707832336, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0174, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.27832016348838806, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0211, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.4203765392303467, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0196, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.31796127557754517, + "learning_rate": 5.233937303988081e-06, + "loss": 0.019, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.4561198949813843, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0198, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.4175209403038025, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0195, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.7017586827278137, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0201, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.4711352288722992, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.02, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.2737489640712738, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0198, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.44284430146217346, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0206, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.4556163251399994, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0208, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.3158712685108185, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0156, + "step": 22000 + }, + { + "epoch": 1.3188327640961113, + "grad_norm": 0.4620053172111511, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0187, + "step": 22010 + }, + { + "epoch": 1.3194319611720295, + "grad_norm": 0.7892107963562012, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0195, + "step": 22020 + }, + { + "epoch": 1.3200311582479478, + "grad_norm": 0.37334534525871277, + "learning_rate": 5.152002600477859e-06, + "loss": 0.02, + "step": 22030 + }, + { + "epoch": 1.320630355323866, + "grad_norm": 0.4440039098262787, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0244, + "step": 22040 + }, + { + "epoch": 1.3212295523997843, + "grad_norm": 0.2650533616542816, + "learning_rate": 5.137194259935739e-06, + "loss": 0.017, + "step": 22050 + }, + { + "epoch": 1.3218287494757026, + "grad_norm": 0.5425522327423096, + "learning_rate": 5.129800405815733e-06, + "loss": 0.019, + "step": 22060 + }, + { + "epoch": 1.3224279465516209, + "grad_norm": 0.5764152407646179, + "learning_rate": 5.122413440701921e-06, + "loss": 0.018, + "step": 22070 + }, + { + "epoch": 1.3230271436275391, + "grad_norm": 0.3985585868358612, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0214, + "step": 22080 + }, + { + "epoch": 1.3236263407034574, + "grad_norm": 0.513511598110199, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0189, + "step": 22090 + }, + { + "epoch": 1.3242255377793757, + "grad_norm": 0.3784070909023285, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0164, + "step": 22100 + }, + { + "epoch": 1.324824734855294, + "grad_norm": 0.7029585242271423, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0201, + "step": 22110 + }, + { + "epoch": 1.3254239319312122, + "grad_norm": 0.28351524472236633, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0207, + "step": 22120 + }, + { + "epoch": 1.3260231290071305, + "grad_norm": 0.5500089526176453, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0222, + "step": 22130 + }, + { + "epoch": 1.3266223260830488, + "grad_norm": 0.35926392674446106, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0195, + "step": 22140 + }, + { + "epoch": 1.327221523158967, + "grad_norm": 0.24845866858959198, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0198, + "step": 22150 + }, + { + "epoch": 1.3278207202348853, + "grad_norm": 0.3264683485031128, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0178, + "step": 22160 + }, + { + "epoch": 1.3284199173108036, + "grad_norm": 0.47955816984176636, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0206, + "step": 22170 + }, + { + "epoch": 1.3290191143867218, + "grad_norm": 0.31802570819854736, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0168, + "step": 22180 + }, + { + "epoch": 1.32961831146264, + "grad_norm": 0.40685755014419556, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0223, + "step": 22190 + }, + { + "epoch": 1.3302175085385584, + "grad_norm": 0.4924621284008026, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0195, + "step": 22200 + }, + { + "epoch": 1.3308167056144766, + "grad_norm": 0.640724241733551, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0183, + "step": 22210 + }, + { + "epoch": 1.331415902690395, + "grad_norm": 0.6712080836296082, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0196, + "step": 22220 + }, + { + "epoch": 1.3320150997663132, + "grad_norm": 0.34785783290863037, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0174, + "step": 22230 + }, + { + "epoch": 1.3326142968422314, + "grad_norm": 0.46851038932800293, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0186, + "step": 22240 + }, + { + "epoch": 1.3332134939181497, + "grad_norm": 0.6138949394226074, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0197, + "step": 22250 + }, + { + "epoch": 1.333812690994068, + "grad_norm": 0.3083338439464569, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0179, + "step": 22260 + }, + { + "epoch": 1.3344118880699862, + "grad_norm": 0.3143295347690582, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0217, + "step": 22270 + }, + { + "epoch": 1.3350110851459045, + "grad_norm": 0.3330692946910858, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0149, + "step": 22280 + }, + { + "epoch": 1.3356102822218228, + "grad_norm": 0.2732333242893219, + "learning_rate": 4.961660586405147e-06, + "loss": 0.017, + "step": 22290 + }, + { + "epoch": 1.336209479297741, + "grad_norm": 0.3350054621696472, + "learning_rate": 4.954434444590436e-06, + "loss": 0.022, + "step": 22300 + }, + { + "epoch": 1.3368086763736593, + "grad_norm": 0.2735322415828705, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0181, + "step": 22310 + }, + { + "epoch": 1.3374078734495776, + "grad_norm": 0.5919206738471985, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0201, + "step": 22320 + }, + { + "epoch": 1.3380070705254958, + "grad_norm": 0.28201058506965637, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0188, + "step": 22330 + }, + { + "epoch": 1.338606267601414, + "grad_norm": 0.505592942237854, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0188, + "step": 22340 + }, + { + "epoch": 1.3392054646773324, + "grad_norm": 0.5231548547744751, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0184, + "step": 22350 + }, + { + "epoch": 1.3398046617532506, + "grad_norm": 0.3743092715740204, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0176, + "step": 22360 + }, + { + "epoch": 1.340403858829169, + "grad_norm": 0.5908241271972656, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0224, + "step": 22370 + }, + { + "epoch": 1.3410030559050872, + "grad_norm": 0.4231952428817749, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0177, + "step": 22380 + }, + { + "epoch": 1.3416022529810054, + "grad_norm": 0.5666583180427551, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0218, + "step": 22390 + }, + { + "epoch": 1.3422014500569237, + "grad_norm": 0.4740161597728729, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0179, + "step": 22400 + }, + { + "epoch": 1.342800647132842, + "grad_norm": 0.3947773873806, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.02, + "step": 22410 + }, + { + "epoch": 1.3433998442087602, + "grad_norm": 0.3114109933376312, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0223, + "step": 22420 + }, + { + "epoch": 1.3439990412846785, + "grad_norm": 0.44969403743743896, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0169, + "step": 22430 + }, + { + "epoch": 1.3445982383605968, + "grad_norm": 0.29602059721946716, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0168, + "step": 22440 + }, + { + "epoch": 1.345197435436515, + "grad_norm": 0.3884619474411011, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0205, + "step": 22450 + }, + { + "epoch": 1.3457966325124333, + "grad_norm": 0.2929127514362335, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0149, + "step": 22460 + }, + { + "epoch": 1.3463958295883516, + "grad_norm": 0.4955149292945862, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0213, + "step": 22470 + }, + { + "epoch": 1.3469950266642698, + "grad_norm": 0.4021163582801819, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0192, + "step": 22480 + }, + { + "epoch": 1.3475942237401881, + "grad_norm": 0.2945493757724762, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.02, + "step": 22490 + }, + { + "epoch": 1.3481934208161064, + "grad_norm": 0.34085726737976074, + "learning_rate": 4.81141273556404e-06, + "loss": 0.0286, + "step": 22500 + }, + { + "epoch": 1.3487926178920246, + "grad_norm": 0.32751014828681946, + "learning_rate": 4.804337352679613e-06, + "loss": 0.0226, + "step": 22510 + }, + { + "epoch": 1.349391814967943, + "grad_norm": 0.3844929337501526, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0155, + "step": 22520 + }, + { + "epoch": 1.3499910120438612, + "grad_norm": 0.5286590456962585, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0229, + "step": 22530 + }, + { + "epoch": 1.3505902091197795, + "grad_norm": 0.26664429903030396, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0151, + "step": 22540 + }, + { + "epoch": 1.3511894061956977, + "grad_norm": 0.528367280960083, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0239, + "step": 22550 + }, + { + "epoch": 1.351788603271616, + "grad_norm": 0.5871155858039856, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0196, + "step": 22560 + }, + { + "epoch": 1.3523878003475343, + "grad_norm": 0.5686034560203552, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0184, + "step": 22570 + }, + { + "epoch": 1.3529869974234525, + "grad_norm": 0.40526366233825684, + "learning_rate": 4.755013723146175e-06, + "loss": 0.018, + "step": 22580 + }, + { + "epoch": 1.3535861944993708, + "grad_norm": 0.37055784463882446, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0184, + "step": 22590 + }, + { + "epoch": 1.354185391575289, + "grad_norm": 0.5210561156272888, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0165, + "step": 22600 + }, + { + "epoch": 1.3547845886512073, + "grad_norm": 0.3386324942111969, + "learning_rate": 4.733984792194363e-06, + "loss": 0.018, + "step": 22610 + }, + { + "epoch": 1.3553837857271256, + "grad_norm": 0.40071168541908264, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0198, + "step": 22620 + }, + { + "epoch": 1.3559829828030439, + "grad_norm": 0.3415983319282532, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0168, + "step": 22630 + }, + { + "epoch": 1.3565821798789621, + "grad_norm": 0.3700709939002991, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0166, + "step": 22640 + }, + { + "epoch": 1.3571813769548804, + "grad_norm": 0.3559338450431824, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0174, + "step": 22650 + }, + { + "epoch": 1.3577805740307987, + "grad_norm": 0.5588265657424927, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0207, + "step": 22660 + }, + { + "epoch": 1.358379771106717, + "grad_norm": 0.4539838433265686, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0164, + "step": 22670 + }, + { + "epoch": 1.3589789681826352, + "grad_norm": 0.34879690408706665, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0165, + "step": 22680 + }, + { + "epoch": 1.3595781652585535, + "grad_norm": 0.22862373292446136, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0158, + "step": 22690 + }, + { + "epoch": 1.3601773623344717, + "grad_norm": 0.5536275506019592, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0137, + "step": 22700 + }, + { + "epoch": 1.36077655941039, + "grad_norm": 0.5599532127380371, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0206, + "step": 22710 + }, + { + "epoch": 1.3613757564863083, + "grad_norm": 0.2961312532424927, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0138, + "step": 22720 + }, + { + "epoch": 1.3619749535622265, + "grad_norm": 0.5834526419639587, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0174, + "step": 22730 + }, + { + "epoch": 1.362574150638145, + "grad_norm": 0.5941792726516724, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0205, + "step": 22740 + }, + { + "epoch": 1.363173347714063, + "grad_norm": 0.2580801844596863, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0199, + "step": 22750 + }, + { + "epoch": 1.3637725447899816, + "grad_norm": 0.3897567689418793, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0168, + "step": 22760 + }, + { + "epoch": 1.3643717418658996, + "grad_norm": 0.37937042117118835, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0213, + "step": 22770 + }, + { + "epoch": 1.364970938941818, + "grad_norm": 0.3964179456233978, + "learning_rate": 4.616077433849538e-06, + "loss": 0.019, + "step": 22780 + }, + { + "epoch": 1.3655701360177361, + "grad_norm": 0.3632303476333618, + "learning_rate": 4.609208744970524e-06, + "loss": 0.015, + "step": 22790 + }, + { + "epoch": 1.3661693330936546, + "grad_norm": 0.5750122666358948, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0168, + "step": 22800 + }, + { + "epoch": 1.3667685301695727, + "grad_norm": 0.36310067772865295, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0172, + "step": 22810 + }, + { + "epoch": 1.3673677272454912, + "grad_norm": 0.5438339114189148, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0198, + "step": 22820 + }, + { + "epoch": 1.3679669243214092, + "grad_norm": 0.37394630908966064, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0202, + "step": 22830 + }, + { + "epoch": 1.3685661213973277, + "grad_norm": 0.2454962432384491, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0188, + "step": 22840 + }, + { + "epoch": 1.3691653184732457, + "grad_norm": 0.474844366312027, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0223, + "step": 22850 + }, + { + "epoch": 1.3697645155491642, + "grad_norm": 0.30256277322769165, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0188, + "step": 22860 + }, + { + "epoch": 1.3703637126250823, + "grad_norm": 0.500045657157898, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0179, + "step": 22870 + }, + { + "epoch": 1.3709629097010008, + "grad_norm": 0.609107494354248, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0182, + "step": 22880 + }, + { + "epoch": 1.3715621067769188, + "grad_norm": 0.20867787301540375, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0168, + "step": 22890 + }, + { + "epoch": 1.3721613038528373, + "grad_norm": 0.41653770208358765, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0172, + "step": 22900 + }, + { + "epoch": 1.3727605009287553, + "grad_norm": 0.357435941696167, + "learning_rate": 4.527371771040039e-06, + "loss": 0.017, + "step": 22910 + }, + { + "epoch": 1.3733596980046738, + "grad_norm": 0.5994096994400024, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0177, + "step": 22920 + }, + { + "epoch": 1.3739588950805919, + "grad_norm": 0.3150171935558319, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0164, + "step": 22930 + }, + { + "epoch": 1.3745580921565104, + "grad_norm": 0.4483601748943329, + "learning_rate": 4.507082898761475e-06, + "loss": 0.019, + "step": 22940 + }, + { + "epoch": 1.3751572892324284, + "grad_norm": 0.529812753200531, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0175, + "step": 22950 + }, + { + "epoch": 1.375756486308347, + "grad_norm": 0.26758334040641785, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0187, + "step": 22960 + }, + { + "epoch": 1.376355683384265, + "grad_norm": 0.3228643834590912, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0195, + "step": 22970 + }, + { + "epoch": 1.3769548804601834, + "grad_norm": 0.3437839150428772, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0207, + "step": 22980 + }, + { + "epoch": 1.3775540775361017, + "grad_norm": 0.28592896461486816, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0185, + "step": 22990 + }, + { + "epoch": 1.37815327461202, + "grad_norm": 0.5544041991233826, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0191, + "step": 23000 + }, + { + "epoch": 1.3787524716879382, + "grad_norm": 1.0831762552261353, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0237, + "step": 23010 + }, + { + "epoch": 1.3793516687638565, + "grad_norm": 0.3546636700630188, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0203, + "step": 23020 + }, + { + "epoch": 1.3799508658397748, + "grad_norm": 0.32998642325401306, + "learning_rate": 4.446628604336844e-06, + "loss": 0.018, + "step": 23030 + }, + { + "epoch": 1.380550062915693, + "grad_norm": 0.40987834334373474, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0189, + "step": 23040 + }, + { + "epoch": 1.3811492599916113, + "grad_norm": 0.6094655990600586, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0174, + "step": 23050 + }, + { + "epoch": 1.3817484570675296, + "grad_norm": 0.631481409072876, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0179, + "step": 23060 + }, + { + "epoch": 1.3823476541434478, + "grad_norm": 0.4069002866744995, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0192, + "step": 23070 + }, + { + "epoch": 1.3829468512193661, + "grad_norm": 0.36600202322006226, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0196, + "step": 23080 + }, + { + "epoch": 1.3835460482952844, + "grad_norm": 0.3092246353626251, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0185, + "step": 23090 + }, + { + "epoch": 1.3841452453712026, + "grad_norm": 0.2811580300331116, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0162, + "step": 23100 + }, + { + "epoch": 1.384744442447121, + "grad_norm": 0.4177345037460327, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0196, + "step": 23110 + }, + { + "epoch": 1.3853436395230392, + "grad_norm": 0.40211164951324463, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0199, + "step": 23120 + }, + { + "epoch": 1.3859428365989575, + "grad_norm": 0.31014713644981384, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0173, + "step": 23130 + }, + { + "epoch": 1.3865420336748757, + "grad_norm": 0.5378808379173279, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0189, + "step": 23140 + }, + { + "epoch": 1.387141230750794, + "grad_norm": 0.3483606278896332, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0201, + "step": 23150 + }, + { + "epoch": 1.3877404278267123, + "grad_norm": 0.5112893581390381, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0209, + "step": 23160 + }, + { + "epoch": 1.3883396249026305, + "grad_norm": 0.26471400260925293, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.013, + "step": 23170 + }, + { + "epoch": 1.3889388219785488, + "grad_norm": 0.6770564317703247, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0174, + "step": 23180 + }, + { + "epoch": 1.389538019054467, + "grad_norm": 0.4251134693622589, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0169, + "step": 23190 + }, + { + "epoch": 1.3901372161303853, + "grad_norm": 0.2985415458679199, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0212, + "step": 23200 + }, + { + "epoch": 1.3907364132063036, + "grad_norm": 0.4635870158672333, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0164, + "step": 23210 + }, + { + "epoch": 1.3913356102822219, + "grad_norm": 0.4360525906085968, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0174, + "step": 23220 + }, + { + "epoch": 1.3919348073581401, + "grad_norm": 0.6121042370796204, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0176, + "step": 23230 + }, + { + "epoch": 1.3925340044340584, + "grad_norm": 0.3049333095550537, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0162, + "step": 23240 + }, + { + "epoch": 1.3931332015099767, + "grad_norm": 0.46471482515335083, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0193, + "step": 23250 + }, + { + "epoch": 1.393732398585895, + "grad_norm": 0.27093327045440674, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0204, + "step": 23260 + }, + { + "epoch": 1.3943315956618132, + "grad_norm": 0.3513331711292267, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0209, + "step": 23270 + }, + { + "epoch": 1.3949307927377315, + "grad_norm": 0.3452320396900177, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0172, + "step": 23280 + }, + { + "epoch": 1.3955299898136497, + "grad_norm": 0.44609951972961426, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0198, + "step": 23290 + }, + { + "epoch": 1.396129186889568, + "grad_norm": 0.27217286825180054, + "learning_rate": 4.269026084410863e-06, + "loss": 0.016, + "step": 23300 + }, + { + "epoch": 1.3967283839654863, + "grad_norm": 0.5857428908348083, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0206, + "step": 23310 + }, + { + "epoch": 1.3973275810414045, + "grad_norm": 0.3834620714187622, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0165, + "step": 23320 + }, + { + "epoch": 1.3979267781173228, + "grad_norm": 0.34176892042160034, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0156, + "step": 23330 + }, + { + "epoch": 1.398525975193241, + "grad_norm": 0.2497260719537735, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0183, + "step": 23340 + }, + { + "epoch": 1.3991251722691593, + "grad_norm": 0.3003418743610382, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0188, + "step": 23350 + }, + { + "epoch": 1.3997243693450776, + "grad_norm": 0.19922316074371338, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0162, + "step": 23360 + }, + { + "epoch": 1.4003235664209959, + "grad_norm": 0.5160003900527954, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0181, + "step": 23370 + }, + { + "epoch": 1.4009227634969141, + "grad_norm": 0.4917953312397003, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0197, + "step": 23380 + }, + { + "epoch": 1.4015219605728324, + "grad_norm": 0.2868032455444336, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0192, + "step": 23390 + }, + { + "epoch": 1.4021211576487507, + "grad_norm": 0.30980560183525085, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0178, + "step": 23400 + }, + { + "epoch": 1.402720354724669, + "grad_norm": 0.31523144245147705, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0193, + "step": 23410 + }, + { + "epoch": 1.4033195518005872, + "grad_norm": 0.23731909692287445, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0171, + "step": 23420 + }, + { + "epoch": 1.4039187488765055, + "grad_norm": 0.4911767542362213, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0171, + "step": 23430 + }, + { + "epoch": 1.4045179459524237, + "grad_norm": 0.3095512390136719, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0165, + "step": 23440 + }, + { + "epoch": 1.405117143028342, + "grad_norm": 0.6421821117401123, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0178, + "step": 23450 + }, + { + "epoch": 1.4057163401042603, + "grad_norm": 0.4887765645980835, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0212, + "step": 23460 + }, + { + "epoch": 1.4063155371801785, + "grad_norm": 0.4543951451778412, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0165, + "step": 23470 + }, + { + "epoch": 1.4069147342560968, + "grad_norm": 0.4595223367214203, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0144, + "step": 23480 + }, + { + "epoch": 1.407513931332015, + "grad_norm": 0.6325511336326599, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0203, + "step": 23490 + }, + { + "epoch": 1.4081131284079333, + "grad_norm": 0.6220779418945312, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0225, + "step": 23500 + }, + { + "epoch": 1.4087123254838516, + "grad_norm": 0.3728989362716675, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0202, + "step": 23510 + }, + { + "epoch": 1.4093115225597699, + "grad_norm": 0.4958861470222473, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0204, + "step": 23520 + }, + { + "epoch": 1.4099107196356881, + "grad_norm": 0.32445529103279114, + "learning_rate": 4.122270968037107e-06, + "loss": 0.016, + "step": 23530 + }, + { + "epoch": 1.4105099167116064, + "grad_norm": 0.3969140350818634, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0174, + "step": 23540 + }, + { + "epoch": 1.4111091137875247, + "grad_norm": 0.39698946475982666, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0163, + "step": 23550 + }, + { + "epoch": 1.411708310863443, + "grad_norm": 0.4633882939815521, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0179, + "step": 23560 + }, + { + "epoch": 1.4123075079393612, + "grad_norm": 0.36993899941444397, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0216, + "step": 23570 + }, + { + "epoch": 1.4129067050152795, + "grad_norm": 0.4137882590293884, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0187, + "step": 23580 + }, + { + "epoch": 1.4135059020911978, + "grad_norm": 0.320867121219635, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0238, + "step": 23590 + }, + { + "epoch": 1.414105099167116, + "grad_norm": 0.3139745593070984, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0175, + "step": 23600 + }, + { + "epoch": 1.4147042962430343, + "grad_norm": 0.572628378868103, + "learning_rate": 4.072221948222934e-06, + "loss": 0.018, + "step": 23610 + }, + { + "epoch": 1.4153034933189526, + "grad_norm": 0.575975239276886, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0189, + "step": 23620 + }, + { + "epoch": 1.4159026903948708, + "grad_norm": 0.26301854848861694, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0121, + "step": 23630 + }, + { + "epoch": 1.416501887470789, + "grad_norm": 0.3042408525943756, + "learning_rate": 4.053587511509546e-06, + "loss": 0.0185, + "step": 23640 + }, + { + "epoch": 1.4171010845467074, + "grad_norm": 0.2503415644168854, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0208, + "step": 23650 + }, + { + "epoch": 1.4177002816226256, + "grad_norm": 0.3556166887283325, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0202, + "step": 23660 + }, + { + "epoch": 1.418299478698544, + "grad_norm": 0.652975857257843, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0194, + "step": 23670 + }, + { + "epoch": 1.4188986757744622, + "grad_norm": 0.4215060770511627, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0166, + "step": 23680 + }, + { + "epoch": 1.4194978728503804, + "grad_norm": 0.2277296483516693, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0172, + "step": 23690 + }, + { + "epoch": 1.4200970699262987, + "grad_norm": 0.3370293378829956, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0201, + "step": 23700 + }, + { + "epoch": 1.420696267002217, + "grad_norm": 0.4235946834087372, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0189, + "step": 23710 + }, + { + "epoch": 1.4212954640781352, + "grad_norm": 1.0387974977493286, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0176, + "step": 23720 + }, + { + "epoch": 1.4218946611540535, + "grad_norm": 0.7258256077766418, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0204, + "step": 23730 + }, + { + "epoch": 1.4224938582299718, + "grad_norm": 0.35412806272506714, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0165, + "step": 23740 + }, + { + "epoch": 1.42309305530589, + "grad_norm": 0.5192556977272034, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0166, + "step": 23750 + }, + { + "epoch": 1.4236922523818083, + "grad_norm": 0.3292843699455261, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0163, + "step": 23760 + }, + { + "epoch": 1.4242914494577266, + "grad_norm": 0.46782153844833374, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0174, + "step": 23770 + }, + { + "epoch": 1.4248906465336448, + "grad_norm": 0.6324945092201233, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0183, + "step": 23780 + }, + { + "epoch": 1.4254898436095633, + "grad_norm": 0.4347882568836212, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0138, + "step": 23790 + }, + { + "epoch": 1.4260890406854814, + "grad_norm": 0.3393082320690155, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0155, + "step": 23800 + }, + { + "epoch": 1.4266882377613999, + "grad_norm": 0.28411221504211426, + "learning_rate": 3.949383948670156e-06, + "loss": 0.016, + "step": 23810 + }, + { + "epoch": 1.427287434837318, + "grad_norm": 0.45982369780540466, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0134, + "step": 23820 + }, + { + "epoch": 1.4278866319132364, + "grad_norm": 0.32810381054878235, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0163, + "step": 23830 + }, + { + "epoch": 1.4284858289891544, + "grad_norm": 0.5996097922325134, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0246, + "step": 23840 + }, + { + "epoch": 1.429085026065073, + "grad_norm": 0.40002167224884033, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0158, + "step": 23850 + }, + { + "epoch": 1.429684223140991, + "grad_norm": 0.4102090299129486, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0179, + "step": 23860 + }, + { + "epoch": 1.4302834202169095, + "grad_norm": 0.44915929436683655, + "learning_rate": 3.913175335139808e-06, + "loss": 0.019, + "step": 23870 + }, + { + "epoch": 1.4308826172928275, + "grad_norm": 0.251206636428833, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0183, + "step": 23880 + }, + { + "epoch": 1.431481814368746, + "grad_norm": 0.2564012408256531, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0182, + "step": 23890 + }, + { + "epoch": 1.432081011444664, + "grad_norm": 0.431265652179718, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0177, + "step": 23900 + }, + { + "epoch": 1.4326802085205825, + "grad_norm": 0.42389997839927673, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0146, + "step": 23910 + }, + { + "epoch": 1.4332794055965006, + "grad_norm": 0.9380725622177124, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0206, + "step": 23920 + }, + { + "epoch": 1.433878602672419, + "grad_norm": 0.3655669093132019, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0151, + "step": 23930 + }, + { + "epoch": 1.4344777997483371, + "grad_norm": 0.3248157501220703, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0152, + "step": 23940 + }, + { + "epoch": 1.4350769968242556, + "grad_norm": 0.5733596086502075, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0175, + "step": 23950 + }, + { + "epoch": 1.4356761939001736, + "grad_norm": 0.4672720730304718, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0185, + "step": 23960 + }, + { + "epoch": 1.4362753909760921, + "grad_norm": 0.22989575564861298, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0165, + "step": 23970 + }, + { + "epoch": 1.4368745880520102, + "grad_norm": 1.0956321954727173, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0181, + "step": 23980 + }, + { + "epoch": 1.4374737851279287, + "grad_norm": 0.39079031348228455, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0212, + "step": 23990 + }, + { + "epoch": 1.4380729822038467, + "grad_norm": 0.3974068760871887, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0197, + "step": 24000 + }, + { + "epoch": 1.4386721792797652, + "grad_norm": 1.1926871538162231, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0185, + "step": 24010 + }, + { + "epoch": 1.4392713763556833, + "grad_norm": 0.40923064947128296, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0171, + "step": 24020 + }, + { + "epoch": 1.4398705734316017, + "grad_norm": 0.38384920358657837, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0161, + "step": 24030 + }, + { + "epoch": 1.4404697705075198, + "grad_norm": 0.21791735291481018, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0168, + "step": 24040 + }, + { + "epoch": 1.4410689675834383, + "grad_norm": 0.3207184672355652, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0147, + "step": 24050 + }, + { + "epoch": 1.4416681646593565, + "grad_norm": 0.4831724166870117, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0182, + "step": 24060 + }, + { + "epoch": 1.4422673617352748, + "grad_norm": 0.47996360063552856, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0175, + "step": 24070 + }, + { + "epoch": 1.442866558811193, + "grad_norm": 0.41330286860466003, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0172, + "step": 24080 + }, + { + "epoch": 1.4434657558871113, + "grad_norm": 0.5012956857681274, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0234, + "step": 24090 + }, + { + "epoch": 1.4440649529630296, + "grad_norm": 0.4715912640094757, + "learning_rate": 3.777162510056721e-06, + "loss": 0.016, + "step": 24100 + }, + { + "epoch": 1.4446641500389479, + "grad_norm": 0.3817141652107239, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0158, + "step": 24110 + }, + { + "epoch": 1.4452633471148661, + "grad_norm": 0.3964484930038452, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0216, + "step": 24120 + }, + { + "epoch": 1.4458625441907844, + "grad_norm": 0.29786166548728943, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0197, + "step": 24130 + }, + { + "epoch": 1.4464617412667027, + "grad_norm": 0.2796359360218048, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.018, + "step": 24140 + }, + { + "epoch": 1.447060938342621, + "grad_norm": 0.30957916378974915, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0154, + "step": 24150 + }, + { + "epoch": 1.4476601354185392, + "grad_norm": 0.3837800920009613, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0171, + "step": 24160 + }, + { + "epoch": 1.4482593324944575, + "grad_norm": 0.29726749658584595, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0167, + "step": 24170 + }, + { + "epoch": 1.4488585295703758, + "grad_norm": 0.4624067544937134, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0187, + "step": 24180 + }, + { + "epoch": 1.449457726646294, + "grad_norm": 0.46996721625328064, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0156, + "step": 24190 + }, + { + "epoch": 1.4500569237222123, + "grad_norm": 0.351532518863678, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0221, + "step": 24200 + }, + { + "epoch": 1.4506561207981306, + "grad_norm": 0.5119938254356384, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0194, + "step": 24210 + }, + { + "epoch": 1.4512553178740488, + "grad_norm": 0.5102914571762085, + "learning_rate": 3.707974016467e-06, + "loss": 0.0152, + "step": 24220 + }, + { + "epoch": 1.451854514949967, + "grad_norm": 0.4638414680957794, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0167, + "step": 24230 + }, + { + "epoch": 1.4524537120258854, + "grad_norm": 0.6181433200836182, + "learning_rate": 3.696562092850226e-06, + "loss": 0.016, + "step": 24240 + }, + { + "epoch": 1.4530529091018036, + "grad_norm": 0.31810933351516724, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0175, + "step": 24250 + }, + { + "epoch": 1.453652106177722, + "grad_norm": 0.20725348591804504, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0139, + "step": 24260 + }, + { + "epoch": 1.4542513032536402, + "grad_norm": 0.29788675904273987, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0153, + "step": 24270 + }, + { + "epoch": 1.4548505003295584, + "grad_norm": 0.286422997713089, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0171, + "step": 24280 + }, + { + "epoch": 1.4554496974054767, + "grad_norm": 0.31199127435684204, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0183, + "step": 24290 + }, + { + "epoch": 1.456048894481395, + "grad_norm": 0.5850293040275574, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0199, + "step": 24300 + }, + { + "epoch": 1.4566480915573132, + "grad_norm": 0.5558650493621826, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0153, + "step": 24310 + }, + { + "epoch": 1.4572472886332315, + "grad_norm": 0.5221429467201233, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0159, + "step": 24320 + }, + { + "epoch": 1.4578464857091498, + "grad_norm": 0.40443119406700134, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0176, + "step": 24330 + }, + { + "epoch": 1.458445682785068, + "grad_norm": 0.4657982289791107, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0183, + "step": 24340 + }, + { + "epoch": 1.4590448798609863, + "grad_norm": 0.23784635961055756, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0156, + "step": 24350 + }, + { + "epoch": 1.4596440769369046, + "grad_norm": 0.3992721438407898, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0223, + "step": 24360 + }, + { + "epoch": 1.4602432740128228, + "grad_norm": 0.3949171304702759, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.014, + "step": 24370 + }, + { + "epoch": 1.460842471088741, + "grad_norm": 0.33738628029823303, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0157, + "step": 24380 + }, + { + "epoch": 1.4614416681646594, + "grad_norm": 0.42644673585891724, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0153, + "step": 24390 + }, + { + "epoch": 1.4620408652405776, + "grad_norm": 0.25812193751335144, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0173, + "step": 24400 + }, + { + "epoch": 1.462640062316496, + "grad_norm": 0.29154765605926514, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0162, + "step": 24410 + }, + { + "epoch": 1.4632392593924142, + "grad_norm": 0.3526030480861664, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0182, + "step": 24420 + }, + { + "epoch": 1.4638384564683324, + "grad_norm": 0.731890857219696, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0166, + "step": 24430 + }, + { + "epoch": 1.4644376535442507, + "grad_norm": 0.34727898240089417, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0146, + "step": 24440 + }, + { + "epoch": 1.465036850620169, + "grad_norm": 0.4517475962638855, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0229, + "step": 24450 + }, + { + "epoch": 1.4656360476960872, + "grad_norm": 0.3026634156703949, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0174, + "step": 24460 + }, + { + "epoch": 1.4662352447720055, + "grad_norm": 0.20546412467956543, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0177, + "step": 24470 + }, + { + "epoch": 1.4668344418479238, + "grad_norm": 0.47296327352523804, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0176, + "step": 24480 + }, + { + "epoch": 1.467433638923842, + "grad_norm": 0.4550913870334625, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0167, + "step": 24490 + }, + { + "epoch": 1.4680328359997603, + "grad_norm": 0.38641592860221863, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0144, + "step": 24500 + }, + { + "epoch": 1.4686320330756786, + "grad_norm": 0.23746857047080994, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0167, + "step": 24510 + }, + { + "epoch": 1.4692312301515968, + "grad_norm": 0.2114812433719635, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0167, + "step": 24520 + }, + { + "epoch": 1.4698304272275151, + "grad_norm": 0.41703343391418457, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.014, + "step": 24530 + }, + { + "epoch": 1.4704296243034334, + "grad_norm": 0.3279412090778351, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0158, + "step": 24540 + }, + { + "epoch": 1.4710288213793516, + "grad_norm": 0.41653862595558167, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0191, + "step": 24550 + }, + { + "epoch": 1.47162801845527, + "grad_norm": 0.5392111539840698, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0151, + "step": 24560 + }, + { + "epoch": 1.4722272155311882, + "grad_norm": 0.4654570519924164, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0179, + "step": 24570 + }, + { + "epoch": 1.4728264126071064, + "grad_norm": 0.5389031171798706, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0139, + "step": 24580 + }, + { + "epoch": 1.4734256096830247, + "grad_norm": 0.38597020506858826, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0142, + "step": 24590 + }, + { + "epoch": 1.474024806758943, + "grad_norm": 0.4820668399333954, + "learning_rate": 3.497061149826966e-06, + "loss": 0.015, + "step": 24600 + }, + { + "epoch": 1.4746240038348613, + "grad_norm": 0.36856982111930847, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0141, + "step": 24610 + }, + { + "epoch": 1.4752232009107795, + "grad_norm": 0.39727091789245605, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0173, + "step": 24620 + }, + { + "epoch": 1.4758223979866978, + "grad_norm": 0.29800575971603394, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.017, + "step": 24630 + }, + { + "epoch": 1.476421595062616, + "grad_norm": 0.6900123357772827, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0172, + "step": 24640 + }, + { + "epoch": 1.4770207921385343, + "grad_norm": 0.2665303647518158, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0157, + "step": 24650 + }, + { + "epoch": 1.4776199892144526, + "grad_norm": 0.3223106265068054, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.016, + "step": 24660 + }, + { + "epoch": 1.4782191862903709, + "grad_norm": 0.3684261739253998, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.02, + "step": 24670 + }, + { + "epoch": 1.4788183833662891, + "grad_norm": 0.38197198510169983, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0169, + "step": 24680 + }, + { + "epoch": 1.4794175804422074, + "grad_norm": 0.35841095447540283, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0175, + "step": 24690 + }, + { + "epoch": 1.4800167775181257, + "grad_norm": 0.4376572370529175, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0156, + "step": 24700 + }, + { + "epoch": 1.480615974594044, + "grad_norm": 0.5526829361915588, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0147, + "step": 24710 + }, + { + "epoch": 1.4812151716699622, + "grad_norm": 0.2922399938106537, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0152, + "step": 24720 + }, + { + "epoch": 1.4818143687458805, + "grad_norm": 0.4333120882511139, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0147, + "step": 24730 + }, + { + "epoch": 1.4824135658217987, + "grad_norm": 0.26118189096450806, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0166, + "step": 24740 + }, + { + "epoch": 1.483012762897717, + "grad_norm": 0.35313257575035095, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.016, + "step": 24750 + }, + { + "epoch": 1.4836119599736353, + "grad_norm": 0.29923367500305176, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0201, + "step": 24760 + }, + { + "epoch": 1.4842111570495535, + "grad_norm": 0.434772253036499, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0193, + "step": 24770 + }, + { + "epoch": 1.4848103541254718, + "grad_norm": 0.3422386646270752, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0148, + "step": 24780 + }, + { + "epoch": 1.48540955120139, + "grad_norm": 0.4303880035877228, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0197, + "step": 24790 + }, + { + "epoch": 1.4860087482773083, + "grad_norm": 0.4511156976222992, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0172, + "step": 24800 + }, + { + "epoch": 1.4866079453532266, + "grad_norm": 0.22014041244983673, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0146, + "step": 24810 + }, + { + "epoch": 1.4872071424291449, + "grad_norm": 0.4387083351612091, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0173, + "step": 24820 + }, + { + "epoch": 1.4878063395050631, + "grad_norm": 0.44642165303230286, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0175, + "step": 24830 + }, + { + "epoch": 1.4884055365809814, + "grad_norm": 0.39087313413619995, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0158, + "step": 24840 + }, + { + "epoch": 1.4890047336568997, + "grad_norm": 0.42447686195373535, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0133, + "step": 24850 + }, + { + "epoch": 1.4896039307328182, + "grad_norm": 0.43447887897491455, + "learning_rate": 3.36005636574796e-06, + "loss": 0.017, + "step": 24860 + }, + { + "epoch": 1.4902031278087362, + "grad_norm": 0.3336028754711151, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0159, + "step": 24870 + }, + { + "epoch": 1.4908023248846547, + "grad_norm": 0.3250858187675476, + "learning_rate": 3.349767211300933e-06, + "loss": 0.0169, + "step": 24880 + }, + { + "epoch": 1.4914015219605727, + "grad_norm": 0.2616746425628662, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0138, + "step": 24890 + }, + { + "epoch": 1.4920007190364912, + "grad_norm": 0.2752698063850403, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0144, + "step": 24900 + }, + { + "epoch": 1.4925999161124093, + "grad_norm": 0.28214627504348755, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0157, + "step": 24910 + }, + { + "epoch": 1.4931991131883278, + "grad_norm": 0.3839667737483978, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0153, + "step": 24920 + }, + { + "epoch": 1.4937983102642458, + "grad_norm": 0.29319512844085693, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0165, + "step": 24930 + }, + { + "epoch": 1.4943975073401643, + "grad_norm": 0.4219116270542145, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0173, + "step": 24940 + }, + { + "epoch": 1.4949967044160823, + "grad_norm": 0.4940520226955414, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0163, + "step": 24950 + }, + { + "epoch": 1.4955959014920008, + "grad_norm": 0.40064749121665955, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0146, + "step": 24960 + }, + { + "epoch": 1.4961950985679189, + "grad_norm": 0.33400869369506836, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0165, + "step": 24970 + }, + { + "epoch": 1.4967942956438374, + "grad_norm": 0.2474612295627594, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0154, + "step": 24980 + }, + { + "epoch": 1.4973934927197554, + "grad_norm": 0.32819071412086487, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0189, + "step": 24990 + }, + { + "epoch": 1.497992689795674, + "grad_norm": 0.32721251249313354, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0144, + "step": 25000 + }, + { + "epoch": 1.498591886871592, + "grad_norm": 0.4054602086544037, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.017, + "step": 25010 + }, + { + "epoch": 1.4991910839475104, + "grad_norm": 0.4691202938556671, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0188, + "step": 25020 + }, + { + "epoch": 1.4997902810234285, + "grad_norm": 0.9318768382072449, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0192, + "step": 25030 + }, + { + "epoch": 1.500389478099347, + "grad_norm": 0.25441330671310425, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0143, + "step": 25040 + }, + { + "epoch": 1.500988675175265, + "grad_norm": 0.3425164520740509, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0152, + "step": 25050 + }, + { + "epoch": 1.5015878722511835, + "grad_norm": 0.3809274733066559, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0154, + "step": 25060 + }, + { + "epoch": 1.5021870693271016, + "grad_norm": 0.2595506012439728, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0144, + "step": 25070 + }, + { + "epoch": 1.50278626640302, + "grad_norm": 0.29121503233909607, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0131, + "step": 25080 + }, + { + "epoch": 1.503385463478938, + "grad_norm": 0.2435981184244156, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0169, + "step": 25090 + }, + { + "epoch": 1.5039846605548566, + "grad_norm": 0.2967667579650879, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0146, + "step": 25100 + }, + { + "epoch": 1.5045838576307746, + "grad_norm": 0.2658415138721466, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0164, + "step": 25110 + }, + { + "epoch": 1.5051830547066931, + "grad_norm": 0.25294387340545654, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0137, + "step": 25120 + }, + { + "epoch": 1.5057822517826112, + "grad_norm": 0.4117964208126068, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0175, + "step": 25130 + }, + { + "epoch": 1.5063814488585296, + "grad_norm": 0.22604988515377045, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0187, + "step": 25140 + }, + { + "epoch": 1.5069806459344477, + "grad_norm": 0.2773517668247223, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0176, + "step": 25150 + }, + { + "epoch": 1.5075798430103662, + "grad_norm": 0.3213720917701721, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0181, + "step": 25160 + }, + { + "epoch": 1.5081790400862842, + "grad_norm": 0.3932463526725769, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0169, + "step": 25170 + }, + { + "epoch": 1.5087782371622027, + "grad_norm": 0.27642500400543213, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0177, + "step": 25180 + }, + { + "epoch": 1.5093774342381208, + "grad_norm": 0.4212909936904907, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0165, + "step": 25190 + }, + { + "epoch": 1.5099766313140393, + "grad_norm": 0.31928038597106934, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0159, + "step": 25200 + }, + { + "epoch": 1.5105758283899573, + "grad_norm": 0.31685909628868103, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0146, + "step": 25210 + }, + { + "epoch": 1.5111750254658758, + "grad_norm": 0.22591470181941986, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0142, + "step": 25220 + }, + { + "epoch": 1.5117742225417938, + "grad_norm": 0.22344504296779633, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0127, + "step": 25230 + }, + { + "epoch": 1.5123734196177123, + "grad_norm": 0.4538969099521637, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.0174, + "step": 25240 + }, + { + "epoch": 1.5129726166936306, + "grad_norm": 0.35422542691230774, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0169, + "step": 25250 + }, + { + "epoch": 1.5135718137695489, + "grad_norm": 0.41911551356315613, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0157, + "step": 25260 + }, + { + "epoch": 1.5141710108454671, + "grad_norm": 0.4679270088672638, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0147, + "step": 25270 + }, + { + "epoch": 1.5147702079213854, + "grad_norm": 0.29286396503448486, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0168, + "step": 25280 + }, + { + "epoch": 1.5153694049973037, + "grad_norm": 0.2840272784233093, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0182, + "step": 25290 + }, + { + "epoch": 1.515968602073222, + "grad_norm": 0.3369516432285309, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0159, + "step": 25300 + }, + { + "epoch": 1.5165677991491402, + "grad_norm": 0.36810392141342163, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.0207, + "step": 25310 + }, + { + "epoch": 1.5171669962250585, + "grad_norm": 0.30844470858573914, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.0151, + "step": 25320 + }, + { + "epoch": 1.5177661933009767, + "grad_norm": 0.22359415888786316, + "learning_rate": 3.127844986891409e-06, + "loss": 0.018, + "step": 25330 + }, + { + "epoch": 1.518365390376895, + "grad_norm": 0.42099806666374207, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0158, + "step": 25340 + }, + { + "epoch": 1.5189645874528133, + "grad_norm": 0.2903825342655182, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0157, + "step": 25350 + }, + { + "epoch": 1.5195637845287315, + "grad_norm": 0.33182457089424133, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0179, + "step": 25360 + }, + { + "epoch": 1.5201629816046498, + "grad_norm": 0.4607376158237457, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0189, + "step": 25370 + }, + { + "epoch": 1.520762178680568, + "grad_norm": 0.21630525588989258, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0156, + "step": 25380 + }, + { + "epoch": 1.5213613757564863, + "grad_norm": 0.38443559408187866, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0174, + "step": 25390 + }, + { + "epoch": 1.5219605728324046, + "grad_norm": 0.19618573784828186, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0126, + "step": 25400 + }, + { + "epoch": 1.5225597699083229, + "grad_norm": 0.4141467809677124, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0147, + "step": 25410 + }, + { + "epoch": 1.5231589669842411, + "grad_norm": 0.39915844798088074, + "learning_rate": 3.085688933413021e-06, + "loss": 0.0156, + "step": 25420 + }, + { + "epoch": 1.5237581640601594, + "grad_norm": 0.25136515498161316, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0147, + "step": 25430 + }, + { + "epoch": 1.5243573611360777, + "grad_norm": 0.30357712507247925, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0153, + "step": 25440 + }, + { + "epoch": 1.524956558211996, + "grad_norm": 0.37422874569892883, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0146, + "step": 25450 + }, + { + "epoch": 1.5255557552879142, + "grad_norm": 0.19593080878257751, + "learning_rate": 3.067194157156521e-06, + "loss": 0.0185, + "step": 25460 + }, + { + "epoch": 1.5261549523638325, + "grad_norm": 0.4984768033027649, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0159, + "step": 25470 + }, + { + "epoch": 1.5267541494397507, + "grad_norm": 0.35011765360832214, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0171, + "step": 25480 + }, + { + "epoch": 1.527353346515669, + "grad_norm": 0.43658894300460815, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.014, + "step": 25490 + }, + { + "epoch": 1.5279525435915873, + "grad_norm": 0.3372974693775177, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0205, + "step": 25500 + }, + { + "epoch": 1.5285517406675055, + "grad_norm": 0.2942260205745697, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0166, + "step": 25510 + }, + { + "epoch": 1.5291509377434238, + "grad_norm": 0.43129920959472656, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0167, + "step": 25520 + }, + { + "epoch": 1.529750134819342, + "grad_norm": 0.3023529648780823, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0167, + "step": 25530 + }, + { + "epoch": 1.5303493318952603, + "grad_norm": 0.298043429851532, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0185, + "step": 25540 + }, + { + "epoch": 1.5309485289711786, + "grad_norm": 0.2765754461288452, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0134, + "step": 25550 + }, + { + "epoch": 1.5315477260470969, + "grad_norm": 0.43460533022880554, + "learning_rate": 3.021609639602321e-06, + "loss": 0.014, + "step": 25560 + }, + { + "epoch": 1.5321469231230151, + "grad_norm": 0.2843260169029236, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0155, + "step": 25570 + }, + { + "epoch": 1.5327461201989334, + "grad_norm": 0.3337956964969635, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0164, + "step": 25580 + }, + { + "epoch": 1.5333453172748517, + "grad_norm": 0.4841095805168152, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0147, + "step": 25590 + }, + { + "epoch": 1.53394451435077, + "grad_norm": 0.31032758951187134, + "learning_rate": 3.003637700546652e-06, + "loss": 0.015, + "step": 25600 + }, + { + "epoch": 1.5345437114266882, + "grad_norm": 0.4080669581890106, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0151, + "step": 25610 + }, + { + "epoch": 1.5351429085026065, + "grad_norm": 0.23705625534057617, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0136, + "step": 25620 + }, + { + "epoch": 1.5357421055785248, + "grad_norm": 0.5293036103248596, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0195, + "step": 25630 + }, + { + "epoch": 1.536341302654443, + "grad_norm": 0.19166356325149536, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0118, + "step": 25640 + }, + { + "epoch": 1.5369404997303613, + "grad_norm": 0.35923510789871216, + "learning_rate": 2.981383959667165e-06, + "loss": 0.0153, + "step": 25650 + }, + { + "epoch": 1.5375396968062796, + "grad_norm": 0.525636613368988, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0169, + "step": 25660 + }, + { + "epoch": 1.5381388938821978, + "grad_norm": 0.3833159804344177, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0155, + "step": 25670 + }, + { + "epoch": 1.538738090958116, + "grad_norm": 0.30203381180763245, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0163, + "step": 25680 + }, + { + "epoch": 1.5393372880340344, + "grad_norm": 0.5735456347465515, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0158, + "step": 25690 + }, + { + "epoch": 1.5399364851099526, + "grad_norm": 0.4676662087440491, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0195, + "step": 25700 + }, + { + "epoch": 1.540535682185871, + "grad_norm": 0.29208818078041077, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0165, + "step": 25710 + }, + { + "epoch": 1.5411348792617892, + "grad_norm": 0.3703807294368744, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.015, + "step": 25720 + }, + { + "epoch": 1.5417340763377074, + "grad_norm": 0.5645684003829956, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0192, + "step": 25730 + }, + { + "epoch": 1.5423332734136257, + "grad_norm": 0.5154808759689331, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0154, + "step": 25740 + }, + { + "epoch": 1.542932470489544, + "grad_norm": 0.49836722016334534, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0161, + "step": 25750 + }, + { + "epoch": 1.5435316675654622, + "grad_norm": 0.4711974561214447, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0143, + "step": 25760 + }, + { + "epoch": 1.5441308646413805, + "grad_norm": 0.3468717932701111, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0151, + "step": 25770 + }, + { + "epoch": 1.5447300617172988, + "grad_norm": 0.3216229975223541, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0146, + "step": 25780 + }, + { + "epoch": 1.5453292587932173, + "grad_norm": 0.3436613976955414, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.0172, + "step": 25790 + }, + { + "epoch": 1.5459284558691353, + "grad_norm": 0.3601810336112976, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0166, + "step": 25800 + }, + { + "epoch": 1.5465276529450538, + "grad_norm": 0.2320292890071869, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0156, + "step": 25810 + }, + { + "epoch": 1.5471268500209718, + "grad_norm": 0.4563167989253998, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0168, + "step": 25820 + }, + { + "epoch": 1.5477260470968903, + "grad_norm": 0.33735397458076477, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0165, + "step": 25830 + }, + { + "epoch": 1.5483252441728084, + "grad_norm": 0.41785505414009094, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0179, + "step": 25840 + }, + { + "epoch": 1.5489244412487269, + "grad_norm": 0.41172194480895996, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.016, + "step": 25850 + }, + { + "epoch": 1.549523638324645, + "grad_norm": 0.4549838900566101, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0134, + "step": 25860 + }, + { + "epoch": 1.5501228354005634, + "grad_norm": 0.6315169930458069, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0176, + "step": 25870 + }, + { + "epoch": 1.5507220324764814, + "grad_norm": 0.43143466114997864, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0166, + "step": 25880 + }, + { + "epoch": 1.5513212295524, + "grad_norm": 0.4559693932533264, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0162, + "step": 25890 + }, + { + "epoch": 1.551920426628318, + "grad_norm": 0.3333865702152252, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0168, + "step": 25900 + }, + { + "epoch": 1.5525196237042365, + "grad_norm": 0.3939986526966095, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0146, + "step": 25910 + }, + { + "epoch": 1.5531188207801545, + "grad_norm": 0.35824787616729736, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0179, + "step": 25920 + }, + { + "epoch": 1.553718017856073, + "grad_norm": 0.40517401695251465, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0158, + "step": 25930 + }, + { + "epoch": 1.554317214931991, + "grad_norm": 0.41149890422821045, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0142, + "step": 25940 + }, + { + "epoch": 1.5549164120079095, + "grad_norm": 0.22149957716464996, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0153, + "step": 25950 + }, + { + "epoch": 1.5555156090838276, + "grad_norm": 0.2622004747390747, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0142, + "step": 25960 + }, + { + "epoch": 1.556114806159746, + "grad_norm": 0.3235580623149872, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.014, + "step": 25970 + }, + { + "epoch": 1.5567140032356641, + "grad_norm": 0.4349730312824249, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0148, + "step": 25980 + }, + { + "epoch": 1.5573132003115826, + "grad_norm": 0.30583831667900085, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0148, + "step": 25990 + }, + { + "epoch": 1.5579123973875006, + "grad_norm": 0.3436671495437622, + "learning_rate": 2.832230653119002e-06, + "loss": 0.015, + "step": 26000 + }, + { + "epoch": 1.5585115944634191, + "grad_norm": 0.23681265115737915, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0169, + "step": 26010 + }, + { + "epoch": 1.5591107915393372, + "grad_norm": 0.2916300892829895, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.0145, + "step": 26020 + }, + { + "epoch": 1.5597099886152557, + "grad_norm": 0.4516601264476776, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.0168, + "step": 26030 + }, + { + "epoch": 1.5603091856911737, + "grad_norm": 0.25640442967414856, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0145, + "step": 26040 + }, + { + "epoch": 1.5609083827670922, + "grad_norm": 0.3058616816997528, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.0134, + "step": 26050 + }, + { + "epoch": 1.5615075798430103, + "grad_norm": 0.37286022305488586, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0133, + "step": 26060 + }, + { + "epoch": 1.5621067769189287, + "grad_norm": 0.2570302486419678, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.0136, + "step": 26070 + }, + { + "epoch": 1.5627059739948468, + "grad_norm": 0.5596319437026978, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0184, + "step": 26080 + }, + { + "epoch": 1.5633051710707653, + "grad_norm": 0.36270666122436523, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.0158, + "step": 26090 + }, + { + "epoch": 1.5639043681466833, + "grad_norm": 0.4473365247249603, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0146, + "step": 26100 + }, + { + "epoch": 1.5645035652226018, + "grad_norm": 0.256773978471756, + "learning_rate": 2.78776903555923e-06, + "loss": 0.0141, + "step": 26110 + }, + { + "epoch": 1.5651027622985199, + "grad_norm": 0.3173777759075165, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.0155, + "step": 26120 + }, + { + "epoch": 1.5657019593744383, + "grad_norm": 0.39649754762649536, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0171, + "step": 26130 + }, + { + "epoch": 1.5663011564503564, + "grad_norm": 0.8298602104187012, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.0181, + "step": 26140 + }, + { + "epoch": 1.5669003535262749, + "grad_norm": 0.41698411107063293, + "learning_rate": 2.771889969647e-06, + "loss": 0.0155, + "step": 26150 + }, + { + "epoch": 1.567499550602193, + "grad_norm": 0.3315671384334564, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0142, + "step": 26160 + }, + { + "epoch": 1.5680987476781114, + "grad_norm": 0.27380600571632385, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0146, + "step": 26170 + }, + { + "epoch": 1.5686979447540295, + "grad_norm": 0.2785346210002899, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0174, + "step": 26180 + }, + { + "epoch": 1.569297141829948, + "grad_norm": 0.46294671297073364, + "learning_rate": 2.756165401834933e-06, + "loss": 0.0177, + "step": 26190 + }, + { + "epoch": 1.569896338905866, + "grad_norm": 0.3026588559150696, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.016, + "step": 26200 + }, + { + "epoch": 1.5704955359817845, + "grad_norm": 0.335443377494812, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0138, + "step": 26210 + }, + { + "epoch": 1.5710947330577025, + "grad_norm": 0.26176130771636963, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0182, + "step": 26220 + }, + { + "epoch": 1.571693930133621, + "grad_norm": 0.41030630469322205, + "learning_rate": 2.740595627381096e-06, + "loss": 0.0157, + "step": 26230 + }, + { + "epoch": 1.572293127209539, + "grad_norm": 0.25381243228912354, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0126, + "step": 26240 + }, + { + "epoch": 1.5728923242854576, + "grad_norm": 0.3790159821510315, + "learning_rate": 2.732868879137055e-06, + "loss": 0.0138, + "step": 26250 + }, + { + "epoch": 1.5734915213613756, + "grad_norm": 0.3830420672893524, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.0134, + "step": 26260 + }, + { + "epoch": 1.574090718437294, + "grad_norm": 0.534146785736084, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0151, + "step": 26270 + }, + { + "epoch": 1.5746899155132121, + "grad_norm": 0.5088993310928345, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0188, + "step": 26280 + }, + { + "epoch": 1.5752891125891306, + "grad_norm": 0.271245539188385, + "learning_rate": 2.717531841969889e-06, + "loss": 0.015, + "step": 26290 + }, + { + "epoch": 1.5758883096650487, + "grad_norm": 0.7041701078414917, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0205, + "step": 26300 + }, + { + "epoch": 1.5764875067409672, + "grad_norm": 1.5670353174209595, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0246, + "step": 26310 + }, + { + "epoch": 1.5770867038168854, + "grad_norm": 0.3782089054584503, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.0145, + "step": 26320 + }, + { + "epoch": 1.5776859008928037, + "grad_norm": 0.2301669716835022, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0132, + "step": 26330 + }, + { + "epoch": 1.578285097968722, + "grad_norm": 0.4629409611225128, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.013, + "step": 26340 + }, + { + "epoch": 1.5788842950446402, + "grad_norm": 0.2709483802318573, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0154, + "step": 26350 + }, + { + "epoch": 1.5794834921205585, + "grad_norm": 0.31532853841781616, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.016, + "step": 26360 + }, + { + "epoch": 1.5800826891964768, + "grad_norm": 0.350920170545578, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.016, + "step": 26370 + }, + { + "epoch": 1.580681886272395, + "grad_norm": 0.5954864025115967, + "learning_rate": 2.683592557860616e-06, + "loss": 0.0178, + "step": 26380 + }, + { + "epoch": 1.5812810833483133, + "grad_norm": 0.4362819492816925, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.017, + "step": 26390 + }, + { + "epoch": 1.5818802804242316, + "grad_norm": 0.2640637755393982, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.0146, + "step": 26400 + }, + { + "epoch": 1.5824794775001498, + "grad_norm": 0.475008100271225, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0134, + "step": 26410 + }, + { + "epoch": 1.583078674576068, + "grad_norm": 0.27583909034729004, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.014, + "step": 26420 + }, + { + "epoch": 1.5836778716519864, + "grad_norm": 0.392715722322464, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0183, + "step": 26430 + }, + { + "epoch": 1.5842770687279046, + "grad_norm": 0.19658122956752777, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0172, + "step": 26440 + }, + { + "epoch": 1.584876265803823, + "grad_norm": 0.8701423406600952, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.015, + "step": 26450 + }, + { + "epoch": 1.5854754628797412, + "grad_norm": 0.9331104159355164, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0152, + "step": 26460 + }, + { + "epoch": 1.5860746599556594, + "grad_norm": 0.29767271876335144, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.0143, + "step": 26470 + }, + { + "epoch": 1.5866738570315777, + "grad_norm": 0.3449382781982422, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.0151, + "step": 26480 + }, + { + "epoch": 1.587273054107496, + "grad_norm": 0.26225733757019043, + "learning_rate": 2.643185095075473e-06, + "loss": 0.0143, + "step": 26490 + }, + { + "epoch": 1.5878722511834142, + "grad_norm": 0.3581456243991852, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0138, + "step": 26500 + }, + { + "epoch": 1.5884714482593325, + "grad_norm": 0.246829554438591, + "learning_rate": 2.635965610168249e-06, + "loss": 0.0178, + "step": 26510 + }, + { + "epoch": 1.5890706453352508, + "grad_norm": 0.317020446062088, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0157, + "step": 26520 + }, + { + "epoch": 1.589669842411169, + "grad_norm": 0.3022174537181854, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0132, + "step": 26530 + }, + { + "epoch": 1.5902690394870873, + "grad_norm": 0.26253461837768555, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0135, + "step": 26540 + }, + { + "epoch": 1.5908682365630056, + "grad_norm": 0.2757222056388855, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0138, + "step": 26550 + }, + { + "epoch": 1.5914674336389238, + "grad_norm": 0.3857184052467346, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.013, + "step": 26560 + }, + { + "epoch": 1.5920666307148421, + "grad_norm": 0.4407658576965332, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0185, + "step": 26570 + }, + { + "epoch": 1.5926658277907604, + "grad_norm": 0.3413793444633484, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0135, + "step": 26580 + }, + { + "epoch": 1.5932650248666786, + "grad_norm": 0.24001765251159668, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0146, + "step": 26590 + }, + { + "epoch": 1.593864221942597, + "grad_norm": 0.4623468518257141, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.015, + "step": 26600 + }, + { + "epoch": 1.5944634190185152, + "grad_norm": 0.32984790205955505, + "learning_rate": 2.600457796416397e-06, + "loss": 0.0159, + "step": 26610 + }, + { + "epoch": 1.5950626160944334, + "grad_norm": 0.31533241271972656, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.0157, + "step": 26620 + }, + { + "epoch": 1.5956618131703517, + "grad_norm": 0.3851890563964844, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0168, + "step": 26630 + }, + { + "epoch": 1.59626101024627, + "grad_norm": 0.41252562403678894, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0165, + "step": 26640 + }, + { + "epoch": 1.5968602073221883, + "grad_norm": 0.473445326089859, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0123, + "step": 26650 + }, + { + "epoch": 1.5974594043981065, + "grad_norm": 0.3054860532283783, + "learning_rate": 2.583073279935805e-06, + "loss": 0.014, + "step": 26660 + }, + { + "epoch": 1.5980586014740248, + "grad_norm": 0.28879237174987793, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.0171, + "step": 26670 + }, + { + "epoch": 1.598657798549943, + "grad_norm": 0.32456526160240173, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0143, + "step": 26680 + }, + { + "epoch": 1.5992569956258613, + "grad_norm": 0.5708281993865967, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0163, + "step": 26690 + }, + { + "epoch": 1.5998561927017796, + "grad_norm": 0.6487006545066833, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.0169, + "step": 26700 + }, + { + "epoch": 1.6004553897776979, + "grad_norm": 0.3364347517490387, + "learning_rate": 2.565935706183804e-06, + "loss": 0.018, + "step": 26710 + }, + { + "epoch": 1.6010545868536161, + "grad_norm": 0.41275516152381897, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0153, + "step": 26720 + }, + { + "epoch": 1.6016537839295344, + "grad_norm": 0.391722708940506, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0161, + "step": 26730 + }, + { + "epoch": 1.6022529810054527, + "grad_norm": 0.3787323534488678, + "learning_rate": 2.555771903907403e-06, + "loss": 0.0174, + "step": 26740 + }, + { + "epoch": 1.602852178081371, + "grad_norm": 0.3075166940689087, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0145, + "step": 26750 + }, + { + "epoch": 1.6034513751572892, + "grad_norm": 0.3613744080066681, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0154, + "step": 26760 + }, + { + "epoch": 1.6040505722332075, + "grad_norm": 0.34713929891586304, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0114, + "step": 26770 + }, + { + "epoch": 1.6046497693091257, + "grad_norm": 0.4100549519062042, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.013, + "step": 26780 + }, + { + "epoch": 1.605248966385044, + "grad_norm": 0.3897320330142975, + "learning_rate": 2.5390304813179e-06, + "loss": 0.016, + "step": 26790 + }, + { + "epoch": 1.6058481634609623, + "grad_norm": 0.3584144413471222, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.015, + "step": 26800 + }, + { + "epoch": 1.6064473605368805, + "grad_norm": 0.31220853328704834, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0131, + "step": 26810 + }, + { + "epoch": 1.6070465576127988, + "grad_norm": 0.3192695379257202, + "learning_rate": 2.529104749380281e-06, + "loss": 0.0133, + "step": 26820 + }, + { + "epoch": 1.607645754688717, + "grad_norm": 0.30283334851264954, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0122, + "step": 26830 + }, + { + "epoch": 1.6082449517646353, + "grad_norm": 0.282143771648407, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0138, + "step": 26840 + }, + { + "epoch": 1.6088441488405536, + "grad_norm": 0.43043816089630127, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0155, + "step": 26850 + }, + { + "epoch": 1.609443345916472, + "grad_norm": 0.2672103941440582, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0153, + "step": 26860 + }, + { + "epoch": 1.6100425429923901, + "grad_norm": 0.39164942502975464, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0145, + "step": 26870 + }, + { + "epoch": 1.6106417400683086, + "grad_norm": 0.33121028542518616, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.014, + "step": 26880 + }, + { + "epoch": 1.6112409371442267, + "grad_norm": 0.46786385774612427, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0144, + "step": 26890 + }, + { + "epoch": 1.6118401342201452, + "grad_norm": 0.4348220229148865, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0136, + "step": 26900 + }, + { + "epoch": 1.6124393312960632, + "grad_norm": 0.7225855588912964, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0159, + "step": 26910 + }, + { + "epoch": 1.6130385283719817, + "grad_norm": 0.540884256362915, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0153, + "step": 26920 + }, + { + "epoch": 1.6136377254478997, + "grad_norm": 0.2984727919101715, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0127, + "step": 26930 + }, + { + "epoch": 1.6142369225238182, + "grad_norm": 0.34762996435165405, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0181, + "step": 26940 + }, + { + "epoch": 1.6148361195997363, + "grad_norm": 0.4229494035243988, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0184, + "step": 26950 + }, + { + "epoch": 1.6154353166756548, + "grad_norm": 0.4511129558086395, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0136, + "step": 26960 + }, + { + "epoch": 1.6160345137515728, + "grad_norm": 0.20887398719787598, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0136, + "step": 26970 + }, + { + "epoch": 1.6166337108274913, + "grad_norm": 0.27858126163482666, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0128, + "step": 26980 + }, + { + "epoch": 1.6172329079034093, + "grad_norm": 0.32049617171287537, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.013, + "step": 26990 + }, + { + "epoch": 1.6178321049793278, + "grad_norm": 0.4276943802833557, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0152, + "step": 27000 + }, + { + "epoch": 1.6184313020552459, + "grad_norm": 0.29610252380371094, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.0122, + "step": 27010 + }, + { + "epoch": 1.6190304991311644, + "grad_norm": 0.24043124914169312, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0116, + "step": 27020 + }, + { + "epoch": 1.6196296962070824, + "grad_norm": 0.33894526958465576, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0119, + "step": 27030 + }, + { + "epoch": 1.620228893283001, + "grad_norm": 0.2597903609275818, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.0144, + "step": 27040 + }, + { + "epoch": 1.620828090358919, + "grad_norm": 0.4067903459072113, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.0137, + "step": 27050 + }, + { + "epoch": 1.6214272874348374, + "grad_norm": 0.48484402894973755, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.0147, + "step": 27060 + }, + { + "epoch": 1.6220264845107555, + "grad_norm": 0.52725750207901, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.0175, + "step": 27070 + }, + { + "epoch": 1.622625681586674, + "grad_norm": 0.23465880751609802, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0123, + "step": 27080 + }, + { + "epoch": 1.623224878662592, + "grad_norm": 0.4273434579372406, + "learning_rate": 2.443811559007335e-06, + "loss": 0.015, + "step": 27090 + }, + { + "epoch": 1.6238240757385105, + "grad_norm": 0.2985517680644989, + "learning_rate": 2.440792688039862e-06, + "loss": 0.013, + "step": 27100 + }, + { + "epoch": 1.6244232728144286, + "grad_norm": 0.4334832727909088, + "learning_rate": 2.437783861778914e-06, + "loss": 0.0113, + "step": 27110 + }, + { + "epoch": 1.625022469890347, + "grad_norm": 0.2899027466773987, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.0153, + "step": 27120 + }, + { + "epoch": 1.625621666966265, + "grad_norm": 0.35197123885154724, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0143, + "step": 27130 + }, + { + "epoch": 1.6262208640421836, + "grad_norm": 0.25402888655662537, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0152, + "step": 27140 + }, + { + "epoch": 1.6268200611181016, + "grad_norm": 0.49205178022384644, + "learning_rate": 2.425849074243997e-06, + "loss": 0.014, + "step": 27150 + }, + { + "epoch": 1.6274192581940201, + "grad_norm": 0.2541142404079437, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0134, + "step": 27160 + }, + { + "epoch": 1.6280184552699382, + "grad_norm": 0.4348624646663666, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0126, + "step": 27170 + }, + { + "epoch": 1.6286176523458566, + "grad_norm": 0.33341577649116516, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0149, + "step": 27180 + }, + { + "epoch": 1.6292168494217747, + "grad_norm": 0.394909143447876, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0146, + "step": 27190 + }, + { + "epoch": 1.6298160464976932, + "grad_norm": 0.47289931774139404, + "learning_rate": 2.411157015949963e-06, + "loss": 0.0165, + "step": 27200 + }, + { + "epoch": 1.6304152435736112, + "grad_norm": 0.45220911502838135, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0116, + "step": 27210 + }, + { + "epoch": 1.6310144406495297, + "grad_norm": 0.36566999554634094, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0158, + "step": 27220 + }, + { + "epoch": 1.6316136377254478, + "grad_norm": 0.26231661438941956, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0131, + "step": 27230 + }, + { + "epoch": 1.6322128348013663, + "grad_norm": 0.32366135716438293, + "learning_rate": 2.399584779188417e-06, + "loss": 0.0131, + "step": 27240 + }, + { + "epoch": 1.6328120318772843, + "grad_norm": 0.3068046271800995, + "learning_rate": 2.396716944205467e-06, + "loss": 0.0123, + "step": 27250 + }, + { + "epoch": 1.6334112289532028, + "grad_norm": 0.28027409315109253, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.013, + "step": 27260 + }, + { + "epoch": 1.6340104260291208, + "grad_norm": 0.3580668270587921, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0133, + "step": 27270 + }, + { + "epoch": 1.6346096231050393, + "grad_norm": 0.42907601594924927, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0148, + "step": 27280 + }, + { + "epoch": 1.6352088201809574, + "grad_norm": 0.2437274307012558, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.014, + "step": 27290 + }, + { + "epoch": 1.6358080172568759, + "grad_norm": 0.3689195513725281, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0113, + "step": 27300 + }, + { + "epoch": 1.636407214332794, + "grad_norm": 0.48261409997940063, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0141, + "step": 27310 + }, + { + "epoch": 1.6370064114087124, + "grad_norm": 0.3526110351085663, + "learning_rate": 2.376924986395271e-06, + "loss": 0.018, + "step": 27320 + }, + { + "epoch": 1.6376056084846304, + "grad_norm": 0.23795528709888458, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0154, + "step": 27330 + }, + { + "epoch": 1.638204805560549, + "grad_norm": 0.40328165888786316, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0128, + "step": 27340 + }, + { + "epoch": 1.638804002636467, + "grad_norm": 0.4420272409915924, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0137, + "step": 27350 + }, + { + "epoch": 1.6394031997123855, + "grad_norm": 0.23652666807174683, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.014, + "step": 27360 + }, + { + "epoch": 1.6400023967883035, + "grad_norm": 0.3468151390552521, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0155, + "step": 27370 + }, + { + "epoch": 1.640601593864222, + "grad_norm": 0.35930299758911133, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.014, + "step": 27380 + }, + { + "epoch": 1.6412007909401403, + "grad_norm": 0.19394037127494812, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0148, + "step": 27390 + }, + { + "epoch": 1.6417999880160585, + "grad_norm": 0.35877296328544617, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.0136, + "step": 27400 + }, + { + "epoch": 1.6423991850919768, + "grad_norm": 0.29156941175460815, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.0128, + "step": 27410 + }, + { + "epoch": 1.642998382167895, + "grad_norm": 0.3780912756919861, + "learning_rate": 2.349511203900333e-06, + "loss": 0.015, + "step": 27420 + }, + { + "epoch": 1.6435975792438133, + "grad_norm": 0.3290363848209381, + "learning_rate": 2.3468256081258e-06, + "loss": 0.0152, + "step": 27430 + }, + { + "epoch": 1.6441967763197316, + "grad_norm": 0.5973288416862488, + "learning_rate": 2.344150167333397e-06, + "loss": 0.015, + "step": 27440 + }, + { + "epoch": 1.6447959733956499, + "grad_norm": 0.4506072402000427, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0146, + "step": 27450 + }, + { + "epoch": 1.6453951704715681, + "grad_norm": 0.32139888405799866, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0168, + "step": 27460 + }, + { + "epoch": 1.6459943675474864, + "grad_norm": 0.3994857370853424, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0152, + "step": 27470 + }, + { + "epoch": 1.6465935646234047, + "grad_norm": 0.26820749044418335, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0134, + "step": 27480 + }, + { + "epoch": 1.647192761699323, + "grad_norm": 0.3729577958583832, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0162, + "step": 27490 + }, + { + "epoch": 1.6477919587752412, + "grad_norm": 0.24220766127109528, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.0138, + "step": 27500 + }, + { + "epoch": 1.6483911558511595, + "grad_norm": 0.49408698081970215, + "learning_rate": 2.325706683525094e-06, + "loss": 0.017, + "step": 27510 + }, + { + "epoch": 1.6489903529270777, + "grad_norm": 0.22594054043293, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0148, + "step": 27520 + }, + { + "epoch": 1.649589550002996, + "grad_norm": 0.41143184900283813, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0152, + "step": 27530 + }, + { + "epoch": 1.6501887470789143, + "grad_norm": 0.3367273509502411, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0138, + "step": 27540 + }, + { + "epoch": 1.6507879441548325, + "grad_norm": 0.6019514203071594, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0168, + "step": 27550 + }, + { + "epoch": 1.6513871412307508, + "grad_norm": 0.5941750407218933, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.013, + "step": 27560 + }, + { + "epoch": 1.651986338306669, + "grad_norm": 0.43502920866012573, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0134, + "step": 27570 + }, + { + "epoch": 1.6525855353825873, + "grad_norm": 0.32287806272506714, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0131, + "step": 27580 + }, + { + "epoch": 1.6531847324585056, + "grad_norm": 0.4743358790874481, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0198, + "step": 27590 + }, + { + "epoch": 1.6537839295344239, + "grad_norm": 0.29685747623443604, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0147, + "step": 27600 + }, + { + "epoch": 1.6543831266103421, + "grad_norm": 0.4355921447277069, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0136, + "step": 27610 + }, + { + "epoch": 1.6549823236862604, + "grad_norm": 0.4096180498600006, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.013, + "step": 27620 + }, + { + "epoch": 1.6555815207621787, + "grad_norm": 0.3704766631126404, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.0152, + "step": 27630 + }, + { + "epoch": 1.656180717838097, + "grad_norm": 0.4177798628807068, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0147, + "step": 27640 + }, + { + "epoch": 1.6567799149140152, + "grad_norm": 0.32486793398857117, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0125, + "step": 27650 + }, + { + "epoch": 1.6573791119899335, + "grad_norm": 0.3335772752761841, + "learning_rate": 2.287865908463585e-06, + "loss": 0.0155, + "step": 27660 + }, + { + "epoch": 1.6579783090658518, + "grad_norm": 0.4169732332229614, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.0153, + "step": 27670 + }, + { + "epoch": 1.65857750614177, + "grad_norm": 0.2390674203634262, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0161, + "step": 27680 + }, + { + "epoch": 1.6591767032176883, + "grad_norm": 0.41580212116241455, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0116, + "step": 27690 + }, + { + "epoch": 1.6597759002936066, + "grad_norm": 0.3981385827064514, + "learning_rate": 2.278163146933236e-06, + "loss": 0.013, + "step": 27700 + }, + { + "epoch": 1.6603750973695248, + "grad_norm": 0.3737584948539734, + "learning_rate": 2.275763038367336e-06, + "loss": 0.011, + "step": 27710 + }, + { + "epoch": 1.660974294445443, + "grad_norm": 0.2370023876428604, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0173, + "step": 27720 + }, + { + "epoch": 1.6615734915213614, + "grad_norm": 0.6599531769752502, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0145, + "step": 27730 + }, + { + "epoch": 1.6621726885972796, + "grad_norm": 0.3255928158760071, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0149, + "step": 27740 + }, + { + "epoch": 1.662771885673198, + "grad_norm": 0.28063544631004333, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0157, + "step": 27750 + }, + { + "epoch": 1.6633710827491162, + "grad_norm": 0.300642192363739, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0139, + "step": 27760 + }, + { + "epoch": 1.6639702798250344, + "grad_norm": 0.3485228717327118, + "learning_rate": 2.261577490652103e-06, + "loss": 0.0139, + "step": 27770 + }, + { + "epoch": 1.6645694769009527, + "grad_norm": 0.31508076190948486, + "learning_rate": 2.259249109209093e-06, + "loss": 0.0162, + "step": 27780 + }, + { + "epoch": 1.665168673976871, + "grad_norm": 0.4764767587184906, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0145, + "step": 27790 + }, + { + "epoch": 1.6657678710527892, + "grad_norm": 0.26427552103996277, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.013, + "step": 27800 + }, + { + "epoch": 1.6663670681287075, + "grad_norm": 0.5152391791343689, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.015, + "step": 27810 + }, + { + "epoch": 1.6669662652046258, + "grad_norm": 0.4326762855052948, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0111, + "step": 27820 + }, + { + "epoch": 1.667565462280544, + "grad_norm": 0.3035188913345337, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0141, + "step": 27830 + }, + { + "epoch": 1.6681646593564623, + "grad_norm": 0.49474793672561646, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0129, + "step": 27840 + }, + { + "epoch": 1.6687638564323806, + "grad_norm": 0.46236565709114075, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.016, + "step": 27850 + }, + { + "epoch": 1.6693630535082988, + "grad_norm": 0.31711387634277344, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.013, + "step": 27860 + }, + { + "epoch": 1.669962250584217, + "grad_norm": 0.4073173701763153, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.0124, + "step": 27870 + }, + { + "epoch": 1.6705614476601354, + "grad_norm": 0.3320833742618561, + "learning_rate": 2.236529916369313e-06, + "loss": 0.0172, + "step": 27880 + }, + { + "epoch": 1.6711606447360536, + "grad_norm": 0.4608694314956665, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0143, + "step": 27890 + }, + { + "epoch": 1.671759841811972, + "grad_norm": 0.9055055975914001, + "learning_rate": 2.232109406453595e-06, + "loss": 0.017, + "step": 27900 + }, + { + "epoch": 1.6723590388878904, + "grad_norm": 0.19240455329418182, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0128, + "step": 27910 + }, + { + "epoch": 1.6729582359638084, + "grad_norm": 0.2756566107273102, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0157, + "step": 27920 + }, + { + "epoch": 1.673557433039727, + "grad_norm": 0.47067585587501526, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0132, + "step": 27930 + }, + { + "epoch": 1.674156630115645, + "grad_norm": 0.421377032995224, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0137, + "step": 27940 + }, + { + "epoch": 1.6747558271915635, + "grad_norm": 0.437125563621521, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0119, + "step": 27950 + }, + { + "epoch": 1.6753550242674815, + "grad_norm": 0.3617478311061859, + "learning_rate": 2.219094909262834e-06, + "loss": 0.0159, + "step": 27960 + }, + { + "epoch": 1.6759542213434, + "grad_norm": 0.39676180481910706, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0099, + "step": 27970 + }, + { + "epoch": 1.676553418419318, + "grad_norm": 0.24751955270767212, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.0121, + "step": 27980 + }, + { + "epoch": 1.6771526154952365, + "grad_norm": 0.5263744592666626, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0144, + "step": 27990 + }, + { + "epoch": 1.6777518125711546, + "grad_norm": 0.28027406334877014, + "learning_rate": 2.210624641425579e-06, + "loss": 0.0119, + "step": 28000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.748965507895132e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5fecc60b61aa66699566b01045633ce2fd4a6a74 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-28000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad96fcc5212b0fb64af2ed9b5a1ad33dee0cea6a86c08271b39c38f4388a38a +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..73866eb4a03632a0fac02605d32ecb705c47ffdb --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/added_tokens.json @@ -0,0 +1,50 @@ +{ + " 0": 256000, + " 1": 256001, + " 10": 256010, + " 11": 256011, + " 12": 256012, + " 13": 256013, + " 14": 256014, + " 15": 256015, + " 16": 256016, + " 17": 256017, + " 18": 256018, + " 19": 256019, + " 2": 256002, + " 20": 256020, + " 21": 256021, + " 22": 256022, + " 23": 256023, + " 24": 256024, + " 25": 256025, + " 26": 256026, + " 27": 256027, + " 28": 256028, + " 29": 256029, + " 3": 256003, + " 30": 256030, + " 31": 256031, + " 32": 256032, + " 33": 256033, + " 34": 256034, + " 35": 256035, + " 36": 256036, + " 37": 256037, + " 38": 256038, + " 39": 256039, + " 4": 256004, + " 40": 256040, + " 41": 256041, + " 42": 256042, + " 43": 256043, + " 44": 256044, + " 45": 256045, + " 46": 256046, + " 47": 256047, + " 5": 256005, + " 6": 256006, + " 7": 256007, + " 8": 256008, + " 9": 256009 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..6176b91aa01592539d01c66b0ead9d200cf5c474 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": false, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ad9c79103582f8a1ffab88ea40d333e1b939f193 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f3654e6151059af0e9d70dae611c3f12b77f2dbb2d781046c7a4e2bb0c3d40e +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8053a2a7749259e01ed494e85872dda42991a232 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a993cdf429e151c934401b7b37c60b257aab6ccaad1894566e8de90f9e681d62 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ea15f9ac0b63661f6fdb5577eaa7e1c8183bf5c4 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8a9614714e21f66d58f97e71778ea4e124243339033c047480027acdd850f86 +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9801b876d4902a6f04c8f4fc65c072e6082867 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -4.131592681121827, + -18.96289906921387, + -16.909606227111816, + -1.205507601451874, + -2.2364452423095704, + -1.8819086204528812, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 16.65274486618042, + 37.19429024200439, + 23.655689654541014, + 1.3209557065963748, + 2.6528479496955875, + 1.1486967510223387, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 2.868856906890869, + 6.296340465545654, + 1.3196077346801758, + 0.007151931058615446, + -0.012491658329963684, + -0.12626242637634277, + 0.12140887975692749, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 4.3321146965026855, + 12.4215087890625, + 7.703039169311523, + 0.391439288854599, + 0.8076039552688599, + 0.505150318145752, + 0.9926025867462158, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.763728466033935, + -21.229162658691408, + -2.350775989151001, + -4.0587354017257695, + -3.285622364997864, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.5495108631134, + 30.41332916412354, + 14.36571702880859, + 1.8286980584144592, + 2.2455153399467473, + 1.9114159921646117, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.469674587249756, + 1.137302041053772, + -3.50521183013916, + -0.009232619777321815, + -0.7088616490364075, + -0.43785586953163147, + 0.14176446199417114, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.948984146118164, + 16.641460418701172, + 8.162801742553711, + 0.6890953779220581, + 1.1180040836334229, + 0.9564125537872314, + 0.9899004101753235, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..94cca75ad6e7aa2f99648db8bff1b8c82e9f8951 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/trainer_state.json @@ -0,0 +1,21034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7975912277548085, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 5.55898904800415, + "learning_rate": 1.8e-07, + "loss": 0.7669, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.9875104427337646, + "learning_rate": 3.8e-07, + "loss": 0.7281, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 6.316451072692871, + "learning_rate": 5.800000000000001e-07, + "loss": 0.7134, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 4.037688255310059, + "learning_rate": 7.8e-07, + "loss": 0.6077, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 5.4920220375061035, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6779, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 3.809985876083374, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5578, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 5.501481533050537, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5453, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 2.584683418273926, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4145, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 2.854585886001587, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3617, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 3.2181553840637207, + "learning_rate": 1.98e-06, + "loss": 0.3402, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 1.6713179349899292, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2286, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 2.60302996635437, + "learning_rate": 2.38e-06, + "loss": 0.2477, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 1.7488818168640137, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1342, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 1.826812982559204, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.1243, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 1.1744091510772705, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1012, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 2.3573529720306396, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1108, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 2.1422371864318848, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1081, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.6756604313850403, + "learning_rate": 3.58e-06, + "loss": 0.0947, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 1.8197052478790283, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.103, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 2.135390281677246, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0791, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 1.185013771057129, + "learning_rate": 4.18e-06, + "loss": 0.0751, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 1.478454828262329, + "learning_rate": 4.38e-06, + "loss": 0.0685, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 1.1979939937591553, + "learning_rate": 4.58e-06, + "loss": 0.0642, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 1.3315266370773315, + "learning_rate": 4.78e-06, + "loss": 0.0706, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 1.219875454902649, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 1.9281997680664062, + "learning_rate": 5.18e-06, + "loss": 0.0781, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.5599610209465027, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0742, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.9128719568252563, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0638, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.5633432269096375, + "learning_rate": 5.78e-06, + "loss": 0.0633, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.7961149215698242, + "learning_rate": 5.98e-06, + "loss": 0.062, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 1.9408375024795532, + "learning_rate": 6.18e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 1.1925369501113892, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0654, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 1.0636825561523438, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0513, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.5671424865722656, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0561, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.8431388139724731, + "learning_rate": 6.98e-06, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 1.3813819885253906, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0619, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.7528055906295776, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0502, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 1.38446044921875, + "learning_rate": 7.58e-06, + "loss": 0.0623, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.9472984671592712, + "learning_rate": 7.78e-06, + "loss": 0.0471, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.640555739402771, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0539, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 1.4841065406799316, + "learning_rate": 8.18e-06, + "loss": 0.0684, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 1.0691452026367188, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.8026740550994873, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0579, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 1.3472259044647217, + "learning_rate": 8.78e-06, + "loss": 0.0725, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.8364902138710022, + "learning_rate": 8.98e-06, + "loss": 0.0613, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 1.0086181163787842, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0558, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 1.0559569597244263, + "learning_rate": 9.38e-06, + "loss": 0.0561, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.9138600826263428, + "learning_rate": 9.58e-06, + "loss": 0.0507, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.6099390387535095, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0543, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.890690803527832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.071, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.8349231481552124, + "learning_rate": 1.018e-05, + "loss": 0.0515, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 1.5466762781143188, + "learning_rate": 1.038e-05, + "loss": 0.0865, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 1.0859519243240356, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0511, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.7235454320907593, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.6314525008201599, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0494, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 1.5067164897918701, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0453, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.9329689145088196, + "learning_rate": 1.138e-05, + "loss": 0.0565, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 1.3631505966186523, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0513, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 1.2341063022613525, + "learning_rate": 1.178e-05, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.7126315832138062, + "learning_rate": 1.198e-05, + "loss": 0.0465, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.9995419383049011, + "learning_rate": 1.218e-05, + "loss": 0.0423, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.7614652514457703, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0466, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.7718682289123535, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0508, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.7280911803245544, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0481, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.6350377798080444, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0493, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.6868598461151123, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.057, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 1.132020354270935, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0464, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 1.097875952720642, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0465, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.8246905207633972, + "learning_rate": 1.378e-05, + "loss": 0.0488, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.5858931541442871, + "learning_rate": 1.398e-05, + "loss": 0.0533, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.418e-05, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.87618488073349, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0417, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.8312808871269226, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0627, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.5213949680328369, + "learning_rate": 1.478e-05, + "loss": 0.0526, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.7599508762359619, + "learning_rate": 1.498e-05, + "loss": 0.0487, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.9282987713813782, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0544, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 1.5959566831588745, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0594, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.6384497284889221, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.049, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.5377854108810425, + "learning_rate": 1.578e-05, + "loss": 0.0529, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.6186609864234924, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0485, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.9750168323516846, + "learning_rate": 1.618e-05, + "loss": 0.0458, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.6810588836669922, + "learning_rate": 1.638e-05, + "loss": 0.0521, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.8613447546958923, + "learning_rate": 1.658e-05, + "loss": 0.0464, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.8379164338111877, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0589, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.9312345385551453, + "learning_rate": 1.698e-05, + "loss": 0.0534, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.6983106732368469, + "learning_rate": 1.718e-05, + "loss": 0.0591, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.6549938321113586, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0571, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3887499272823334, + "learning_rate": 1.758e-05, + "loss": 0.0362, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 1.1392686367034912, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0602, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.834979772567749, + "learning_rate": 1.798e-05, + "loss": 0.0483, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.9094700813293457, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0536, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.9519254565238953, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0514, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.6514044404029846, + "learning_rate": 1.858e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.6005147099494934, + "learning_rate": 1.878e-05, + "loss": 0.0527, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 1.0990339517593384, + "learning_rate": 1.898e-05, + "loss": 0.0453, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.7029110193252563, + "learning_rate": 1.918e-05, + "loss": 0.0527, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.6106461882591248, + "learning_rate": 1.938e-05, + "loss": 0.043, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.48976996541023254, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0482, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 1.045139193534851, + "learning_rate": 1.978e-05, + "loss": 0.0449, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.7444337010383606, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0499, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.8378720879554749, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0606, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.5345956683158875, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.041, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.6428268551826477, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0648, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.9010246992111206, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0441, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.6655222177505493, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0532, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.5328973531723022, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0488, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 1.2394806146621704, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0525, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.9671902656555176, + "learning_rate": 1.999967041472886e-05, + "loss": 0.051, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.8754792213439941, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.054, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.524354875087738, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0682, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 1.0633796453475952, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0435, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.7348024249076843, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.923546552658081, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0501, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 1.0579051971435547, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.8214036822319031, + "learning_rate": 1.999882759038658e-05, + "loss": 0.057, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.7640904188156128, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0468, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5744732022285461, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0416, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.40397152304649353, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0389, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.6207796931266785, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0484, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 1.5230320692062378, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0586, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.8499330282211304, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0671, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.7697583436965942, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.061, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.6107252836227417, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.40468829870224, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0558, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.7711566686630249, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0487, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 1.0216137170791626, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0411, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 1.1135109663009644, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0428, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.545289158821106, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0426, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.9514102339744568, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0529, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.9448748826980591, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0468, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 1.1176340579986572, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.06, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.6428054571151733, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0398, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.8000763058662415, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0688, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.7624617218971252, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0524, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.7986068725585938, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0511, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 1.179044246673584, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0518, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.7511209845542908, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.041, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.8336644768714905, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.7198546528816223, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0472, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 1.404756784439087, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0479, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.861412525177002, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0448, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 1.2575286626815796, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0504, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.7020149230957031, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0416, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.9072129726409912, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0483, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.5503928661346436, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0498, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.5776561498641968, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.7854406237602234, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0431, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.7011817097663879, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0615, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.7760916352272034, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0525, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.9866206049919128, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0492, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.7466640472412109, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0564, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.8808642029762268, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0461, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.8980852365493774, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0613, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.6824257969856262, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0763, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.681532084941864, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0492, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.5667393207550049, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0471, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.5026432275772095, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0424, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.37448638677597046, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.037, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.6236661076545715, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.9748323559761047, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0326, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.7733910083770752, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0527, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.6466084718704224, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.6644402146339417, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0434, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 1.5936143398284912, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0495, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.5655786991119385, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0475, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.9557194709777832, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0518, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.8929481506347656, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0435, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.7515624761581421, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0404, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.7718303203582764, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0476, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.5583183765411377, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0495, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.7166038155555725, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0601, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.9311782717704773, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0507, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6159361600875854, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0319, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.816769003868103, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0505, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.9040331244468689, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.0498, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 1.696012020111084, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0689, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.5169436931610107, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0414, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 1.9156256914138794, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0558, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.6522107720375061, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0427, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.8480607867240906, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0425, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.6939795017242432, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0521, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.5763843059539795, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0486, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.6420201063156128, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0428, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.5305889248847961, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0371, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 1.3216971158981323, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0441, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.6441370844841003, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0444, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 1.4227683544158936, + "learning_rate": 1.996014938229576e-05, + "loss": 0.053, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.667000412940979, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0405, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.6865925192832947, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0532, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.8819414377212524, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0402, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.8738685250282288, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0494, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.8790421485900879, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0753, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.5451251268386841, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0385, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.46721863746643066, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0395, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.41896265745162964, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0461, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.7582527995109558, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0461, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.7154091596603394, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0464, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.788686215877533, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.46885132789611816, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0472, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.5174703598022461, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0501, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.8058022260665894, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.044, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.49327152967453003, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0404, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 1.532515048980713, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0548, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 1.1101130247116089, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0542, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.7396823763847351, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.042, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5801792740821838, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0589, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 1.4451886415481567, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0402, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.61793053150177, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0583, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.8073042631149292, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0492, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.9468027949333191, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0466, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.7384629249572754, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0589, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.4612124562263489, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.043, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.6821345090866089, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0373, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.6727206110954285, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0706, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.6935863494873047, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0376, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.9824007153511047, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0418, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.9782054424285889, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0453, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.7749345898628235, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0449, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 1.1558616161346436, + "learning_rate": 1.992544454099507e-05, + "loss": 0.051, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.33876606822013855, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0463, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.5539175271987915, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0389, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.554639995098114, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0375, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.46284249424934387, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0365, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.7209586501121521, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0465, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 1.0352572202682495, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0609, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.3893347680568695, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0449, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.3959295451641083, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.042, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.47758615016937256, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0608, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.7173318266868591, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0511, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.5889247059822083, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.5986958146095276, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.9506963491439819, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0513, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.8730902671813965, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0429, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.5152983069419861, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0347, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.786233127117157, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0464, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.7376151084899902, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0479, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.595055878162384, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0392, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.8207923769950867, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0441, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.7003177404403687, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.036, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.6637803316116333, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.5207458138465881, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0476, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 1.241939663887024, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0466, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.7212964296340942, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0459, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.6244897246360779, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.571205198764801, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0611, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.8839776515960693, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0464, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.580142080783844, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0434, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.6745111346244812, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0443, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.9726730585098267, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.48007458448410034, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0442, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.7205815315246582, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0461, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.5800597667694092, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0553, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.6497617959976196, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0398, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.7487000226974487, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.053, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.6686383485794067, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0494, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.6101617217063904, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0397, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.49039891362190247, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 1.076252818107605, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0472, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.7085466980934143, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0481, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.6343501210212708, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.7452435493469238, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0485, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.6645557880401611, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0455, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.5987662076950073, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0384, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 1.078682541847229, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0416, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.8880276083946228, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0427, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.8119439482688904, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0516, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.5018808245658875, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.035, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.623843252658844, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0468, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.48201584815979004, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0387, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.5672967433929443, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0374, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.7304291129112244, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0458, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 1.1493513584136963, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0495, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.8220258951187134, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0565, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 1.0740118026733398, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0484, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.6214267015457153, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0346, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6255515813827515, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 1.0625102519989014, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0511, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.8623147010803223, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.043, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.92961186170578, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0428, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.6050530076026917, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0405, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.944632351398468, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0434, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.4904105067253113, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0423, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.7352654337882996, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0425, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 1.0492011308670044, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0616, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.7823440432548523, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0447, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.8018720149993896, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0371, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.49853745102882385, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.036, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.8805229663848877, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0524, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.5573164820671082, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0387, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.7481330633163452, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0466, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.40816730260849, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0651, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.6791403889656067, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0393, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.7291558384895325, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0521, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.6312416791915894, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0489, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.7327824831008911, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0343, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 1.3112396001815796, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 1.2425460815429688, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0419, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.6839079856872559, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0491, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.7781338691711426, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0434, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.5329035520553589, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0468, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.7196246981620789, + "learning_rate": 1.978769450291435e-05, + "loss": 0.044, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.7625473737716675, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0441, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.5458085536956787, + "learning_rate": 1.978346349055984e-05, + "loss": 0.039, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.7765107154846191, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0467, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.7010345458984375, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.04, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.626748263835907, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0373, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.5149411559104919, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0461, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.9740221500396729, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.037, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.504397988319397, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.054, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.5483772158622742, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0365, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.29313552379608154, + "learning_rate": 1.976612732743278e-05, + "loss": 0.038, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.8453809022903442, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0413, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.5152369141578674, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0383, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.9969985485076904, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0465, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.9506912231445312, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0377, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.9154256582260132, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0428, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 1.2283018827438354, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0403, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.6880149841308594, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0395, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.4900283217430115, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0368, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.7604786157608032, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0447, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.559420108795166, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0456, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5867525339126587, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.4810929596424103, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0406, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.8294567465782166, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0405, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.8964418172836304, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0551, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5311513543128967, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.048, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.806564450263977, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0422, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.6752825975418091, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0436, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.5873673558235168, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.046, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.44951826333999634, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.6930672526359558, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0482, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5176821351051331, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0469, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.49050986766815186, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0505, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.7312544584274292, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0397, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.7582018375396729, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0472, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.5867499113082886, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0402, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.5435264706611633, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0357, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.7370457053184509, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.774713933467865, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0419, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 1.3614526987075806, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0443, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.6087996959686279, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0362, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.6685174703598022, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0437, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.9508783221244812, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0403, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.5553990006446838, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0454, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.5054144263267517, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0651, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.42293739318847656, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0431, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.7212286591529846, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0415, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.473127543926239, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.6872493028640747, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.031, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.5251455903053284, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0391, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.5380337834358215, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0409, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.7052116394042969, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0416, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.8229309916496277, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0372, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.9506240487098694, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0419, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.6417449116706848, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.6112877130508423, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0498, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 1.0621747970581055, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0478, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.7538444995880127, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0402, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.5625021457672119, + "learning_rate": 1.964833301001045e-05, + "loss": 0.048, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.47914358973503113, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0371, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.6854084134101868, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0478, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.9252145886421204, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0368, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.8439743518829346, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0417, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 1.0050065517425537, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0444, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.7451267242431641, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0444, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.8371824622154236, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0413, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 1.0461528301239014, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0343, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.39973369240760803, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0411, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.4291725754737854, + "learning_rate": 1.962083815106258e-05, + "loss": 0.035, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.7072318196296692, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.5897591710090637, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0422, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.724743664264679, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0412, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.6499989628791809, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0456, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.7375554442405701, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0481, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.5231707096099854, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0444, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.6235650777816772, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0352, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.43499720096588135, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0389, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.797736406326294, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0444, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 1.0550916194915771, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0504, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.6214169263839722, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0406, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.698083221912384, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0593, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.6379665732383728, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0493, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.5507146120071411, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0433, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.5956857204437256, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.44772031903266907, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0479, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.9360495209693909, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0434, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.5642439126968384, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0396, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.4046037495136261, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0408, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.5948778986930847, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0349, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.8199960589408875, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.035, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.4827987253665924, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0422, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.8324541449546814, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0396, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.4008340537548065, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0399, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.6216022372245789, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0456, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.37505266070365906, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0385, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.49176743626594543, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0394, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.5399725437164307, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0438, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.8310949802398682, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 1.1955338716506958, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0459, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 1.0068060159683228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0491, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.5460902452468872, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.7850955128669739, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.038, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.36727651953697205, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.042, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.5334084630012512, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0472, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.7271261215209961, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0382, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.5323888063430786, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0436, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.45585381984710693, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0374, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.7871994376182556, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0523, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.5605924129486084, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0394, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.6938880085945129, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0394, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.5804795026779175, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0437, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 1.0168874263763428, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0419, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.6860261559486389, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0381, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.7029629349708557, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.5081820487976074, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0359, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4721413254737854, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0445, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.36132606863975525, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0443, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.6331628561019897, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.5754039287567139, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0364, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 1.5680726766586304, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0568, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.49352893233299255, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0352, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.6292720437049866, + "learning_rate": 1.945830755977688e-05, + "loss": 0.056, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.7185224294662476, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.5580431222915649, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0395, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.7590157985687256, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0367, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.6500505208969116, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0373, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.408975213766098, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0458, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.5616204142570496, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.6361889243125916, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0371, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.8486977219581604, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0428, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.7492835521697998, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0444, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.7901867032051086, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0413, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.6845218539237976, + "learning_rate": 1.942106227801521e-05, + "loss": 0.041, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.9644033908843994, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0482, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.45466694235801697, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.37155815958976746, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0563, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.4936427175998688, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0466, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.6540364027023315, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0426, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.38369905948638916, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.5450782179832458, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0499, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.24151510000228882, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0431, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.8069043159484863, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0447, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.5423257946968079, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0496, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.4058588445186615, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0402, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.6126188635826111, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0458, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.7490487694740295, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0493, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.7295238971710205, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0462, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.7178632616996765, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0443, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.7040836215019226, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0414, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.6338651776313782, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0354, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 1.3360642194747925, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0503, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.46927154064178467, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.7340303659439087, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0381, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5492366552352905, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0328, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.7509336471557617, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0368, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.4471103847026825, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0405, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.6582043170928955, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0422, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.6933317184448242, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0347, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.450021892786026, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.5376274585723877, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0619, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.722744882106781, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0446, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.6075776219367981, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.047, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.6950559020042419, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0366, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.5763269066810608, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0416, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.5462995767593384, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.042, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.6304270029067993, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0388, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.6828057765960693, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0324, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.37152284383773804, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0451, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.4172256588935852, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0357, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.5640333294868469, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0522, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.8016167879104614, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0381, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.591262698173523, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0382, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.5212893486022949, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0478, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.7837402820587158, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0443, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.9257993698120117, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0468, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.5952717065811157, + "learning_rate": 1.926404507646751e-05, + "loss": 0.033, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.9675727486610413, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0451, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5518060326576233, + "learning_rate": 1.925630281527157e-05, + "loss": 0.039, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.9742224216461182, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0398, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.6197847723960876, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0466, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.47963154315948486, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0449, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.41337478160858154, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0441, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.7238340973854065, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0438, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.9248948097229004, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.059, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.6670559048652649, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0388, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.956350564956665, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0402, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.6378766894340515, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0377, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.9037134647369385, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.046, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.7720431685447693, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0519, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.7988153100013733, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0437, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.6672379970550537, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.8264118432998657, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0463, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.6753244400024414, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.048, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.5530163645744324, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0552, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 1.4215611219406128, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0537, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.8495141267776489, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0431, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.5609806180000305, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0355, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.30011680722236633, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0503, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.5155858993530273, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0402, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.48371294140815735, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0476, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.49065709114074707, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.4877799451351166, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0337, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.5917441248893738, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0379, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.42583322525024414, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.045, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.6343463659286499, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0449, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.8575727343559265, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0453, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.7644649147987366, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0396, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.6534778475761414, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0354, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.3632607161998749, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.035, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.9180629849433899, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.48914220929145813, + "learning_rate": 1.912298771234382e-05, + "loss": 0.043, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.8579902052879333, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0467, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 1.523177146911621, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 1.2650493383407593, + "learning_rate": 1.911035077753307e-05, + "loss": 0.046, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.8262631893157959, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0345, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.8710194826126099, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.8287770748138428, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.7243760824203491, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0445, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5953600406646729, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0409, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.5678296685218811, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0405, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.764759361743927, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0399, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.5969082713127136, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0345, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.5686851739883423, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0415, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.7042335867881775, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0343, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.46049684286117554, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0367, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.521037757396698, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0493, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.6116137504577637, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0341, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.6932541728019714, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.038, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.6795322299003601, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0555, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 1.5589205026626587, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0498, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.58689945936203, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0432, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.7746279239654541, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0455, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4707143008708954, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0365, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.6717873811721802, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0441, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 1.1001774072647095, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0387, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.6617273092269897, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.045, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 1.0732862949371338, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0461, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.43623387813568115, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0387, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.5842541456222534, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0401, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.8832051753997803, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0434, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.8454849123954773, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0364, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4587421119213104, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0342, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.5914700627326965, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0381, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.5075448751449585, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0614, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.6165316700935364, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0355, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.8761339783668518, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0382, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.8730667233467102, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0486, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.4631735384464264, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0479, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.7657212615013123, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0359, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.49685898423194885, + "learning_rate": 1.894749443411004e-05, + "loss": 0.037, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.8567603230476379, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0415, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.8778802156448364, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0427, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.7849876284599304, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.041, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.49304109811782837, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0406, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.6490961909294128, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0457, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 1.1704363822937012, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0489, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.6955687403678894, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0426, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.9385222792625427, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0397, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 1.0259507894515991, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0406, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 1.5581048727035522, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0377, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 1.1154224872589111, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0352, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.8913238048553467, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0372, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.32929253578186035, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0302, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.7686375379562378, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.7077587246894836, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0404, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.7370178699493408, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0379, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.8013477325439453, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0391, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.9743591547012329, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0466, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.6816489100456238, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0509, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.7814317345619202, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0449, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.6797910332679749, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.041, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.7159250378608704, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0408, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.7630175352096558, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0403, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.7929314374923706, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0468, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.5765302181243896, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0382, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.5043740272521973, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0447, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.7895818948745728, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0381, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.8037170767784119, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0434, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 1.0758732557296753, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0369, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.6673927307128906, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0475, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.6661775708198547, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0478, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.6422731280326843, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.6632615923881531, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0377, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.5715954899787903, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0306, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3375200629234314, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0385, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.42938506603240967, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0359, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.453436940908432, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0498, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.763883113861084, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.037, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.9350517392158508, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0524, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.6795313358306885, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0336, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4761887788772583, + "learning_rate": 1.875213208215953e-05, + "loss": 0.04, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.6547576189041138, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0359, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.7119831442832947, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0382, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.5195598602294922, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0577, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.44893282651901245, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.034, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.5159012079238892, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0374, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.6474353075027466, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0275, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.5070436000823975, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0382, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.28868627548217773, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0442, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.3915226459503174, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.6271824836730957, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0395, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 1.2117619514465332, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0409, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.4455721378326416, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0399, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0445, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.32646581530570984, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0435, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.4477322995662689, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0383, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.6562448740005493, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0317, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.25427868962287903, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0326, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.6234788298606873, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0328, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.4264411926269531, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0379, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.5537038445472717, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0383, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.5042442679405212, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0339, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.4152010679244995, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0324, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.6834092736244202, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0364, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.6276392340660095, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0336, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.687937319278717, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0415, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.48481765389442444, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0376, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 1.1335153579711914, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0421, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.6853719353675842, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.043, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.97500079870224, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0334, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.2953243553638458, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0334, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.6563237309455872, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0349, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.4983973205089569, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0441, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.42969775199890137, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0319, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.8316324353218079, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0359, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.4386466443538666, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0371, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.5664681792259216, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0359, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.5660601854324341, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.6432987451553345, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0447, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.6026568412780762, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0382, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.5358585119247437, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0366, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.3575671315193176, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0394, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.6645073890686035, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0391, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.6527594923973083, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0334, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.5664045810699463, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0426, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.3317505419254303, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0366, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.7218614220619202, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0399, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.6683867573738098, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0385, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.6589217185974121, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0445, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.39663317799568176, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0515, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.9468401074409485, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 1.0980640649795532, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0431, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 1.4567275047302246, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0467, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.3785778284072876, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0437, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.8112056255340576, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0406, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.8885411024093628, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0452, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.3356691002845764, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.033, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.7636258602142334, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.039, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.5050523281097412, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0331, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3761812150478363, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0346, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.560323178768158, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0417, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.5850566625595093, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0366, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.4377721846103668, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0315, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.5460193157196045, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0465, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.3818223476409912, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0313, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.566722571849823, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.037, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.970040500164032, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0354, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4968736171722412, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0376, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.5235893130302429, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0383, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.853208065032959, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0384, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.4627811312675476, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0615, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.4883791208267212, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0307, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.4702740013599396, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0539, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.5020611882209778, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0378, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.706611692905426, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0309, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.6137747764587402, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0364, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.45299193263053894, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0359, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.31410297751426697, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0425, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.48510870337486267, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.04, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.4697261154651642, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0401, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.8231471180915833, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0346, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.9511741995811462, + "learning_rate": 1.832162565208597e-05, + "loss": 0.038, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.4473752975463867, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0421, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.5309840440750122, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0375, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 1.1700010299682617, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0424, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.5007262229919434, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0389, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.8835527300834656, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.6059357523918152, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.37744027376174927, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0391, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.5641717910766602, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0383, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.4394749104976654, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0394, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.7094572186470032, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0384, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.6306723952293396, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0347, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.4480315148830414, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0415, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 1.014607310295105, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0426, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.7599517107009888, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0433, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 1.0942739248275757, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0378, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.47618037462234497, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0312, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6470023393630981, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0382, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.6031871438026428, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0336, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.7470970749855042, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0318, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.46166181564331055, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0361, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5585920214653015, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0443, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.5172198414802551, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0396, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.4908123314380646, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0294, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.5269665122032166, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0343, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.747257649898529, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0395, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.6794129610061646, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0471, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.4291394054889679, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0388, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.8051080107688904, + "learning_rate": 1.815952390818299e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.557299792766571, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0384, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.37832972407341003, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0333, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.30844688415527344, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.033, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.3014371395111084, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0344, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.778361439704895, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0351, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 1.14492666721344, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0462, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.35099321603775024, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0371, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.8470032215118408, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0339, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.641718327999115, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0363, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.6668172478675842, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0383, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.9396918416023254, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0401, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.5773718953132629, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0356, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.6474881172180176, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0487, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.5183063745498657, + "learning_rate": 1.807599344877606e-05, + "loss": 0.037, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.7699562311172485, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0487, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.6379490494728088, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0407, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4757876396179199, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0307, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.47382786870002747, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0367, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.6868136525154114, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0311, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.5475189089775085, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0293, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 1.013775110244751, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0383, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.46351560950279236, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0404, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.4883617162704468, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0408, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.6282979249954224, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0428, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 1.0833567380905151, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0394, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0405, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.7581565380096436, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0534, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.7900646328926086, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0432, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.6033529043197632, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0438, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.924926221370697, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0347, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.8485580682754517, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0523, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.3205278217792511, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0334, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.5392606854438782, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.03, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.6815987229347229, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0385, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.9605218768119812, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0359, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.5565723776817322, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0391, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.7528144717216492, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0431, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.5746167898178101, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0346, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.5058369636535645, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0346, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 1.1387027502059937, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0372, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.819324254989624, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0374, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.45600345730781555, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0344, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.7428935766220093, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0373, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.6960753202438354, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0387, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.6637990474700928, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0404, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.5612137317657471, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0375, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.6323001384735107, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0379, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.35169267654418945, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0371, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.38252803683280945, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0457, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.38694459199905396, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0345, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.37036198377609253, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0292, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.8060199618339539, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0398, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.44252580404281616, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0373, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.5565180778503418, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0345, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.4460795521736145, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0404, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.7309815883636475, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0364, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.6990997195243835, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0561, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.4198327660560608, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0401, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.04, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.48884230852127075, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0334, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.6440362930297852, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0451, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.9092825055122375, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0398, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.4839508533477783, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.8128801584243774, + "learning_rate": 1.776452218695584e-05, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.5291397571563721, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0394, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.6852243542671204, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0418, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.6294205188751221, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0374, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.5221384763717651, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0321, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.398296982049942, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0349, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.43008267879486084, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.6012991070747375, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0411, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.45076051354408264, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.037, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.6742259860038757, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0357, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.5989789962768555, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.037, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.4041040241718292, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0325, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.4937855899333954, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0354, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.5446217656135559, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0374, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.7479701638221741, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0415, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.7822495102882385, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0341, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.3672648072242737, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.035, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.5219965577125549, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0443, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.4092100262641907, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0331, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.5316944122314453, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0406, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 1.072263240814209, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0521, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.7448581457138062, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0362, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.44557711482048035, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0326, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.4298631250858307, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0365, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.45413365960121155, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0351, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.9562819004058838, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0394, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.9481335878372192, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0381, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.5020818114280701, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0402, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.6412234902381897, + "learning_rate": 1.758137056131309e-05, + "loss": 0.037, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.5511493682861328, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0535, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.5222594141960144, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0401, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.447127103805542, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0383, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.4780801832675934, + "learning_rate": 1.754802282200567e-05, + "loss": 0.041, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.2962804138660431, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0422, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.5125643014907837, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0337, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.4288216829299927, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0374, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4114690124988556, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0296, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.3511301577091217, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0315, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.8624657392501831, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0369, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.5518651008605957, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0364, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.5404661297798157, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0294, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.7494591474533081, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0315, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9748606085777283, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0429, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.8071768879890442, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0321, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.5210712552070618, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0355, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.6077958941459656, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0426, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.8688217997550964, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0366, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.7064969539642334, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0465, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0365, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.6350638270378113, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0419, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.42818939685821533, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0412, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.6915261745452881, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0327, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.9861057996749878, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.034, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.6910699009895325, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0463, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.6368144750595093, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0399, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 1.1909242868423462, + "learning_rate": 1.739216409306913e-05, + "loss": 0.042, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.6449970006942749, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0388, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.531061053276062, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0389, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.8275352716445923, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0503, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.8468548655509949, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0336, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.2949988842010498, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0342, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.30603477358818054, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 0.7177753448486328, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0381, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.4893733859062195, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0319, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.6618909239768982, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0317, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.5965152382850647, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0293, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.4357168674468994, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0478, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.9539002776145935, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0444, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.7171940207481384, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.037, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.5711817741394043, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.034, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.4134632647037506, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.39306095242500305, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0351, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.318985253572464, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0425, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.7674827575683594, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.041, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.7754977941513062, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0436, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.5827674269676208, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0371, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.3957774341106415, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0401, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.47415387630462646, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0344, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.6292631030082703, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.5913583636283875, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0385, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.465749055147171, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0402, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.7115443348884583, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0372, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.7476089596748352, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.042, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.5902891159057617, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0319, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.7117035984992981, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0312, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.7726907730102539, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0381, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.7318345308303833, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0464, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.8139578104019165, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0334, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.6128831505775452, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0338, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.478384405374527, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0361, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.36900776624679565, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0473, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 1.031351923942566, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0417, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.5248333215713501, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0402, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.6325647830963135, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.047, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.8417870402336121, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0406, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.617125391960144, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0385, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.4480224847793579, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0391, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 1.0203324556350708, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0379, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.6231842637062073, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0318, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37685611844062805, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0304, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 1.0700500011444092, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0362, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.4233555495738983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0341, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.7783017158508301, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0331, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.718287467956543, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0385, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.5477543473243713, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0308, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.5601311326026917, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0384, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.4944303631782532, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.5038384199142456, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0382, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.7288672924041748, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0319, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 1.0376721620559692, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0374, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.8827543258666992, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0351, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4307865798473358, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0321, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.5480561256408691, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0532, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.9598987102508545, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0365, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.4162677228450775, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0274, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.8729338049888611, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0437, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.7729384899139404, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0386, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.6997544169425964, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0303, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.49331608414649963, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0333, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.6684675812721252, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0329, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.5638986825942993, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.035, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.8375849723815918, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0431, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.5796175599098206, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0298, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.5302409529685974, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.032, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.43450990319252014, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0415, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.3897189795970917, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0372, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.8202592134475708, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0329, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.8023095726966858, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.3732883930206299, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0326, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.4916521906852722, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.031, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.46110638976097107, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.037, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.8587718605995178, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0351, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.7067242860794067, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.036, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.732545793056488, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.036, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.6573438048362732, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0392, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.6036579608917236, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0383, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.5556638836860657, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0396, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.7848073244094849, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0333, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.5758033394813538, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0315, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.5620765686035156, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0277, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.38210418820381165, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0437, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.6145310997962952, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0368, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.7370103001594543, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0349, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.942118763923645, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0399, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.5294848680496216, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0364, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.5716073513031006, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0313, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.4549729526042938, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0423, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.5841232538223267, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0369, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.3302208483219147, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.032, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.7107377648353577, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0382, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.6884296536445618, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0324, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.6279621720314026, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0314, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.882046103477478, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0408, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.8980706334114075, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0436, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.6433938145637512, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0395, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.6394492983818054, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.041, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.8700910806655884, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0333, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.6309515237808228, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0341, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.7955977916717529, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.8543604016304016, + "learning_rate": 1.663934987558109e-05, + "loss": 0.042, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.6915370225906372, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0347, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.6430726647377014, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0395, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.3080710768699646, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0299, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0407, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.7147136330604553, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0524, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.603560209274292, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.032, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.4913748502731323, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0419, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.532796323299408, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0463, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.7834717631340027, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0318, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.4865007698535919, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0329, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.5567988753318787, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0331, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.7487075328826904, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0408, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0294, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.7240496277809143, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0334, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.44733667373657227, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0378, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.7610008716583252, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0398, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 1.0738579034805298, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0461, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.5492804050445557, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0367, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.7817861437797546, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0392, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.6080313324928284, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0288, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.8218061923980713, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0335, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.6597305536270142, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0398, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.6254639625549316, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0339, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 1.0747283697128296, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0386, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.4679741859436035, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0409, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.7349653244018555, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0355, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.47712597250938416, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0524, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.8520345091819763, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0361, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.6470016837120056, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0296, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.8512763381004333, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0329, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.5876182913780212, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0381, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.47419166564941406, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0348, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.391215056180954, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0366, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.5373614430427551, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0373, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.23266319930553436, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0283, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.8146935105323792, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0377, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.5002696514129639, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0296, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.7518969774246216, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0394, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.44596755504608154, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0359, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.37095823884010315, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.031, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.48388785123825073, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0323, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.4681354761123657, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0573, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.9335370063781738, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0397, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.8231816291809082, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0307, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.7194622755050659, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0435, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.468923419713974, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.5806415677070618, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0422, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.6381694078445435, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0325, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.6025328636169434, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0321, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.7287771701812744, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0432, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.7109095454216003, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0315, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.4904409348964691, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0317, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.7382795214653015, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0296, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 1.2814927101135254, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.4594469368457794, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0297, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.5907943844795227, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.623093843460083, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0314, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.5146417021751404, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0362, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.5858095288276672, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0339, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.4178197383880615, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0445, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.37311851978302, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0321, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.6305625438690186, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0376, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.5927552580833435, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0339, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.4024806022644043, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.5766516327857971, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0325, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.4729812443256378, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0476, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.4650471806526184, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0387, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.6432391405105591, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.6335821151733398, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0307, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.5947774052619934, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0374, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.7248526811599731, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0286, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.5646173357963562, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0426, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.4240330457687378, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0261, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.6439619064331055, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0325, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.5899927020072937, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0328, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.6412765383720398, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.027, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.28143197298049927, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0285, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.2767931818962097, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0312, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.47175201773643494, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0318, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.4454171359539032, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0357, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.4573518931865692, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0319, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.5321150422096252, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0423, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.27531248331069946, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0284, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.663298487663269, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0328, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.9017484188079834, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0328, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0445, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.4777899980545044, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0348, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.5475958585739136, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0418, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.524467408657074, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0301, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.6302708387374878, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0334, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.41625329852104187, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0353, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2699313759803772, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0387, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.701999306678772, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.6053565144538879, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0343, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.864326000213623, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0371, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.7532107830047607, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0323, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.5603524446487427, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0357, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.5668624639511108, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0421, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.6352995038032532, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0381, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.7873902320861816, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0293, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.5853860378265381, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0336, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.525260329246521, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0404, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.4027518033981323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0334, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.9426722526550293, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0397, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.6003656983375549, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0408, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.643667459487915, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0507, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.6342907547950745, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0338, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.4388107657432556, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0393, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.3304736614227295, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0371, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.6479781866073608, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0357, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.5461524128913879, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0367, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.4362160563468933, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0302, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.5188114643096924, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0322, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.34805068373680115, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0355, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.5073755383491516, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0446, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.5647034645080566, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0386, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.5983169078826904, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.4163302481174469, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0278, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.5769792199134827, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.33103784918785095, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0272, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.6019038558006287, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0286, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.8199634552001953, + "learning_rate": 1.56658563993822e-05, + "loss": 0.041, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.7426667213439941, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0327, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.3630203306674957, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0316, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.7804543972015381, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0369, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.43314239382743835, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0362, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.5570499897003174, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0307, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.5796618461608887, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0312, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.7355082035064697, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0357, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.39807555079460144, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0281, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.7723329663276672, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0314, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.3936077058315277, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0344, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.6881195902824402, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0343, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.5343065857887268, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0336, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.6643530130386353, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.032, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.5642407536506653, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0326, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.6929567456245422, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0351, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.33013442158699036, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0362, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 1.056101679801941, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0443, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.5164589881896973, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0446, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.319035142660141, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0367, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.8530817627906799, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0321, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.7768056392669678, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0318, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.4015219211578369, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0263, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.6409371495246887, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0371, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.5829829573631287, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0424, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.8098331093788147, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0318, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.40581029653549194, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0345, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.5018268823623657, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0338, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.3689005970954895, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0304, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.4961407482624054, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0349, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.5551972389221191, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0389, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.5989762544631958, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0308, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.33431145548820496, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0291, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.5390793085098267, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0409, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.6348057389259338, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0299, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.9015149474143982, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0372, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.4148661494255066, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0351, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.48212167620658875, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0369, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.6210904121398926, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0387, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.4606397747993469, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0325, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.597671627998352, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0264, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.39612457156181335, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0291, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.514916718006134, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0327, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.3551333248615265, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0306, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.3721555173397064, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0343, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3669307231903076, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0339, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.5142899751663208, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0388, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.7722563147544861, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0319, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.5405625104904175, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.025, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.6617732048034668, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0361, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.8938334584236145, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0326, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.7913880944252014, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0325, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.6919751763343811, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0353, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.6518043279647827, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0292, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.8302627801895142, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0292, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.6278629302978516, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0314, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.42736759781837463, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0313, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 1.0469647645950317, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.038, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.4306422173976898, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.692587673664093, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.034, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.8272542953491211, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0332, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.700703501701355, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0435, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22474133968353271, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0348, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.47771376371383667, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0365, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.5043072700500488, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0336, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.4886966347694397, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0291, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.3845444321632385, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0418, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.6324570775032043, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0357, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.5614244937896729, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0351, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.4815816879272461, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0401, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.7729785442352295, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0357, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.589121401309967, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0319, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.5420895218849182, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0346, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.4504237771034241, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0279, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.26984909176826477, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.6075000762939453, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0319, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.6065084338188171, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0383, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.573225736618042, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0424, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.8821173906326294, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0409, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.4947790205478668, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0472, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.748337984085083, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0384, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.6375566124916077, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0373, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.6218035221099854, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0343, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.4296681880950928, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0317, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3609360158443451, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0348, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.49597665667533875, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.034, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.4339931309223175, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0351, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.44051092863082886, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0391, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.41610655188560486, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0345, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.6215106844902039, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0439, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.6418285965919495, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0289, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.6148926019668579, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0396, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.8690620064735413, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0371, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.4794996678829193, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.7622746229171753, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0396, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 1.0384955406188965, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0352, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.33424243330955505, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0272, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.5626234412193298, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0267, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.31714314222335815, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0297, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.8281066417694092, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0337, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.6054716110229492, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0336, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.5764144659042358, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0296, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.4696876108646393, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0318, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.5324695110321045, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0294, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.2989593744277954, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0275, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.6373855471611023, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0334, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.5332064032554626, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0333, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.4900652766227722, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.6812027096748352, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0321, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.6765509843826294, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0329, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.5016193389892578, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.034, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.5259473919868469, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0341, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.4551076292991638, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0289, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.5946309566497803, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0367, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.8045580387115479, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0292, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 1.089473843574524, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0433, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.7314861416816711, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0344, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.3244793713092804, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0329, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.9454575181007385, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.041, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.4321480393409729, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0338, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.7338399887084961, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0317, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.5811594724655151, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0299, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 1.1259782314300537, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0402, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.4460951089859009, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0279, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.4996945858001709, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0331, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.6428117156028748, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0339, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.7815113663673401, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0333, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.46364331245422363, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0321, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.6084109544754028, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0347, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.5775942206382751, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.4764224886894226, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0326, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.49608105421066284, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.033, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.40599140524864197, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0323, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.44920462369918823, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0348, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.393081396818161, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0329, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.5393109917640686, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0332, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.49641427397727966, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0341, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.4762181341648102, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0293, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.7498350143432617, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0338, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.5212231874465942, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0336, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3803718388080597, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0336, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.3723069429397583, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0313, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.6411343216896057, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0298, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.7487270832061768, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0334, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.4146348237991333, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0362, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.6354920864105225, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0345, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.8422425985336304, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.6452838182449341, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0317, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.6057304739952087, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0349, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.4880058467388153, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0283, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.6094764471054077, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0424, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.552979588508606, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0318, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.5134180188179016, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0267, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.3264164626598358, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0347, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.6406404972076416, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0326, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.4818336069583893, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0357, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.4660695791244507, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0348, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.527518093585968, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0293, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.5105645656585693, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0299, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.5807327628135681, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0348, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.34552720189094543, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0281, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.6902264952659607, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0345, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.7842390537261963, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0392, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.37371599674224854, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0307, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.4447094798088074, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0343, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.5179654359817505, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0328, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.34313148260116577, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0327, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.5038807988166809, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0398, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.5751231908798218, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0365, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.23205915093421936, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0338, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.3348182141780853, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0264, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.432725727558136, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0377, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.5504162907600403, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0334, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.7994229793548584, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0369, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.7374292016029358, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0305, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.786674976348877, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0283, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.39285191893577576, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.028, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.49710261821746826, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0285, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.2925172448158264, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0353, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.5930903553962708, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0265, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.5205737352371216, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0349, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.5042659044265747, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0376, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.6537132263183594, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0402, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.5453435182571411, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0344, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.7153663635253906, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0365, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.4821360409259796, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0359, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.389950156211853, + "learning_rate": 1.403120543105273e-05, + "loss": 0.031, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.6750137805938721, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0353, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.5380377173423767, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0329, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.45814576745033264, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0312, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.6910536289215088, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0349, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.49182868003845215, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0377, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.41329771280288696, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0383, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.47242429852485657, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0313, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.45115360617637634, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0294, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.44364428520202637, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0328, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.4205247461795807, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0282, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 1.0961225032806396, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0274, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.6065059304237366, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0327, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.3095875084400177, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0348, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.8527400493621826, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0285, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.4449825882911682, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0435, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 1.1708461046218872, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0312, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.6145966053009033, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0283, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.5100684762001038, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0331, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.37704023718833923, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0327, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.6774486899375916, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0347, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.4984931945800781, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0303, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.6189061403274536, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0316, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4665672183036804, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.038, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.898800790309906, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0292, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.5205129384994507, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0322, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.588542640209198, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0307, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.620620846748352, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.035, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.639234185218811, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0296, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.38672956824302673, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0355, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.5244165062904358, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0305, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.8960945010185242, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0323, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3789278566837311, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.031, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.5104514956474304, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0405, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.5860878825187683, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0376, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.9913963079452515, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0386, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.4112319350242615, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0276, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.703815221786499, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0303, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.7342479825019836, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0303, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.46025165915489197, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0324, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.3976695239543915, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0255, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.4137699604034424, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0298, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.6333696842193604, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0438, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.5179958343505859, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0268, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.5947912335395813, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0266, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.7916423678398132, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0363, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.7686305046081543, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0338, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.5727254152297974, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0275, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.8913756012916565, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0365, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.45855259895324707, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0401, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.8214496374130249, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0371, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.5001949667930603, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.033, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.6546716094017029, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0422, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.35789239406585693, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0323, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.7539666891098022, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0316, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.422543466091156, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0388, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.5595449805259705, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0351, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.3847978115081787, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0285, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.4276559352874756, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0292, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.5125867128372192, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0351, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.7208243012428284, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0293, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.5181360244750977, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0316, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.3499206304550171, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0281, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.26258599758148193, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.027, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.7002774477005005, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.031, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.5419202446937561, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0384, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.3112017512321472, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0234, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.6459445357322693, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0302, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.5128807425498962, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0385, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.41403454542160034, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0321, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.4647153615951538, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0358, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.29951611161231995, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0288, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.3440749943256378, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0274, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.413753867149353, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0276, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.29087361693382263, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.03, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.7001593708992004, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0277, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.47245970368385315, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0426, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.5747501850128174, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0337, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.42420580983161926, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0407, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.2931080162525177, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0344, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.8410253524780273, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0385, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.27601751685142517, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0304, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5673372745513916, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0261, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.5385505557060242, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0296, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.4159039556980133, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0343, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 1.0409079790115356, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0325, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.5017931461334229, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0311, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.45170727372169495, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0302, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.7260886430740356, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0353, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.7251535058021545, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0329, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.21863135695457458, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0354, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.5168152451515198, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0268, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.509765088558197, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0321, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.4227997958660126, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.031, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.5740527510643005, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0351, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.5497387647628784, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0277, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.3965212106704712, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.028, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.43198928236961365, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0421, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.42254316806793213, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0335, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.3395012617111206, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0309, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.6258816719055176, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0287, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.7914189100265503, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0263, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.4104739725589752, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0282, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.47704172134399414, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0358, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.7908433675765991, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0341, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.7039026021957397, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0369, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.4095489978790283, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.047, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.6500707864761353, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0285, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3794250190258026, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0293, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.3065261244773865, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.031, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.3773103654384613, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0303, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.602186918258667, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0398, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.5309048891067505, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0251, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.9474682211875916, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0345, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.7786683440208435, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0289, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.6320096850395203, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0326, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.7034086585044861, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0332, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.5060988664627075, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0337, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.7484520673751831, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0317, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.6556681394577026, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0349, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.41952699422836304, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0318, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.4678110182285309, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0328, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.35579657554626465, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0346, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.5984554290771484, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0277, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.41169118881225586, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0288, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.5163332223892212, + "learning_rate": 1.285944160290905e-05, + "loss": 0.027, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.780305802822113, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0249, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.4293205142021179, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0302, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.650065004825592, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0349, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.3155161142349243, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0333, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.5841111540794373, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0371, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.3873291015625, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0304, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.39657002687454224, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0279, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.6305680871009827, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0293, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.5810249447822571, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0317, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.6288999319076538, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0283, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.5402754545211792, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0258, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 1.3184820413589478, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0398, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.9564218521118164, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0301, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.8810652494430542, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0376, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.4254887104034424, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0336, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.45076319575309753, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0266, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.6057546138763428, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0292, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.4007343649864197, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0352, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.4183088541030884, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0265, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.368300199508667, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0326, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.4838104844093323, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0262, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.5136057138442993, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0299, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.5161435604095459, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0339, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.6350359320640564, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0361, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.5247905254364014, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0259, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.5668240785598755, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0324, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.48688119649887085, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0395, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.8496071100234985, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0326, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.7072296142578125, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0307, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.7262448072433472, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0376, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.5265096426010132, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0331, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.7246168851852417, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0286, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.4539868235588074, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.036, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.36881664395332336, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0302, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.37113773822784424, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0278, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.537762463092804, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0325, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.6519997715950012, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0309, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.31448549032211304, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0245, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.43815988302230835, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0398, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.525791585445404, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0261, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.4887944757938385, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.025, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.5287007689476013, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0278, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.7277513146400452, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0304, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.6415050029754639, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0292, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.48691895604133606, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0337, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.53068608045578, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0338, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.5464624762535095, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0303, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.3911614418029785, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0345, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.6894099116325378, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0365, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.5268317461013794, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0405, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.8635499477386475, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0321, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21542859077453613, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0264, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.6257337331771851, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0355, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.6525475978851318, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0304, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.4599299430847168, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0314, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.7497361898422241, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.031, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.3124896287918091, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0257, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.6170748472213745, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0323, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.4619428515434265, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0315, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.5088011026382446, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0255, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.5397948622703552, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0265, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.457082062959671, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0279, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.4131294786930084, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0269, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 1.1949660778045654, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.6057063341140747, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0306, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.26918280124664307, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0283, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.48841091990470886, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0323, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.6195886135101318, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0295, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.5798623561859131, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.031, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.4877539277076721, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0267, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.33261221647262573, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0261, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.8361077904701233, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0311, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.305922269821167, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0302, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.22662357985973358, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.028, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.4273515045642853, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0307, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.521216869354248, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0277, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.7090896368026733, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0346, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.3693661391735077, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0305, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.3651321530342102, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0263, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.5577923655509949, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0357, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.6504148840904236, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0404, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.49205282330513, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.035, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.6053458452224731, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0328, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.5949649214744568, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0302, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.5310356020927429, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0264, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4087911546230316, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0273, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.35929426550865173, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0274, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.5112904906272888, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0253, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.39148232340812683, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0305, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.47718697786331177, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0304, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.620936393737793, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0289, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.8953443169593811, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0328, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.4663226902484894, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0302, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.707167387008667, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0319, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.5325813889503479, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0318, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.6239158511161804, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0289, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.38823947310447693, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0266, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.48849165439605713, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0234, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.23214028775691986, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0276, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3467197120189667, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0282, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.2009357064962387, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0298, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.8589951395988464, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0264, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.43969056010246277, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0292, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.5750611424446106, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0289, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.5399556756019592, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0307, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.20517395436763763, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0249, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.7490189671516418, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0246, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.6661257743835449, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0325, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.571394681930542, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0342, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.8792482018470764, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0332, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.5770248770713806, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0286, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.62962406873703, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0246, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.4651380479335785, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.037, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.5087499022483826, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0265, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.44421979784965515, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0306, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.6521517038345337, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0334, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.5384942889213562, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0296, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.41909387707710266, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0297, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.6697047352790833, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0331, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.4015032947063446, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0326, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.48070228099823, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0278, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.8651071786880493, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0242, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 1.17703378200531, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0288, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.45865103602409363, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0322, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.41243845224380493, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0297, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.482997864484787, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0305, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.5319142937660217, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0284, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.6116752028465271, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0311, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.4214901328086853, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0269, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.6246733069419861, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.026, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.4263368248939514, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0305, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.4059041738510132, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.022, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.6362516283988953, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0265, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.2905973494052887, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0297, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.42270833253860474, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0255, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.26410749554634094, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0252, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.7570974230766296, + "learning_rate": 1.153689339251154e-05, + "loss": 0.027, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.5941224098205566, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0295, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.3985750079154968, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0337, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.3877560496330261, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.024, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.44742006063461304, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0284, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.3280893564224243, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0318, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.5289477109909058, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0341, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.4976208806037903, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0239, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.6153465509414673, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0252, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.6112402677536011, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0292, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.4973732531070709, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0307, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.5871816277503967, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0254, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 1.2150986194610596, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.033, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.6406526565551758, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0265, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.4251798093318939, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0269, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.4702431857585907, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0311, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.3235304355621338, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0236, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.4913889467716217, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0231, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.4980977177619934, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0289, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.740922212600708, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0334, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.3305300772190094, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0301, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.7037357091903687, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0311, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.44783756136894226, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0339, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.7776843309402466, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0349, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.49181437492370605, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0285, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.333814799785614, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0284, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 1.203652262687683, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0365, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.521643877029419, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0313, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.33309581875801086, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0265, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.48567256331443787, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0357, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8473871946334839, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0355, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.43827518820762634, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0266, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.5849157571792603, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0317, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.5690399408340454, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0266, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.6484784483909607, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0294, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.8894811272621155, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0239, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4575272798538208, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0323, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.4288756847381592, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.032, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.8871303200721741, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0243, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.5861580967903137, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0335, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.4159319996833801, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0247, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.6948496699333191, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0299, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.5089551210403442, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0333, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.6912631392478943, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0303, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.6213784217834473, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0295, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.4634060561656952, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0261, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.5664045214653015, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0262, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.7963227033615112, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0278, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.45378491282463074, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0268, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.8970746994018555, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0271, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.5109472274780273, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0307, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.5023297667503357, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0263, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.6055631041526794, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0285, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.38602766394615173, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0282, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.5447302460670471, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0319, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.6613780856132507, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0271, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 1.0358555316925049, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.026, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.4463629722595215, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0271, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.5373798608779907, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.025, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.7735916972160339, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0325, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.5017692446708679, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0262, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.3406142592430115, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0271, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.28971537947654724, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0238, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.45441415905952454, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0261, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.4653581976890564, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.026, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.5449947714805603, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0314, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.41015395522117615, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0272, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.5936392545700073, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0269, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.5043690800666809, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0256, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.6176534295082092, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0285, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.6774734258651733, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0268, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.7045454978942871, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0305, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.5905448794364929, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0284, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.7881343364715576, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0321, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.6635507941246033, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0284, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.46298888325691223, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0394, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.5187172889709473, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0257, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.5974661707878113, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0305, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.5171123743057251, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0275, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.35988888144493103, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0295, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.30543047189712524, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0334, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.6582810878753662, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0309, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.4986134171485901, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0294, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.5560855269432068, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0224, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.28974607586860657, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0313, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.24015791714191437, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.026, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.2704199552536011, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0244, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.6661707162857056, + "learning_rate": 1.068904422762975e-05, + "loss": 0.027, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.5058556795120239, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0254, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.7086800336837769, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0242, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.6752822399139404, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0262, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.8279762268066406, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0312, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.5070614814758301, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0308, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.3933897614479065, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0287, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.37238794565200806, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0325, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.7591347098350525, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0265, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.4841652810573578, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0331, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.45236295461654663, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0412, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.4774094820022583, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0289, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.47564345598220825, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0294, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.341337651014328, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0281, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.341701865196228, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0224, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.6621959209442139, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0283, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.348466694355011, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0234, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.35208311676979065, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0248, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.4973156154155731, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0246, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.3668982982635498, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0228, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.4771873950958252, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0303, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.3595021665096283, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0265, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.6013099551200867, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0297, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.40996676683425903, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0321, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.45742037892341614, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0288, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.8092222213745117, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0278, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.32741186022758484, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0288, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.5716732740402222, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0256, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.3263239562511444, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0271, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.35390567779541016, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0266, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.36520150303840637, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0265, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.46227532625198364, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0305, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.40079647302627563, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0327, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.3689155578613281, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0249, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.49527907371520996, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.029, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.38931334018707275, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0233, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.5698918700218201, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0269, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 1.0959579944610596, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.029, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.6321646571159363, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0276, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.7166606783866882, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0292, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.6464444994926453, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0246, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.7318128347396851, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0296, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.4828032851219177, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0247, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.4509548842906952, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0241, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.413630872964859, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0313, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.42443349957466125, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0316, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.8199112415313721, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0389, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.28918105363845825, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0242, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.6759344339370728, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0308, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.5480250120162964, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.025, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.48897549510002136, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.027, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.6111220121383667, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0276, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.8852546215057373, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0251, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.5098162889480591, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.022, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.45974940061569214, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0206, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3925095200538635, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0251, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.5461363792419434, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0217, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.5685333609580994, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0231, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.494150310754776, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0243, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.8770614862442017, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0286, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.27142134308815, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0253, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.3365682363510132, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0241, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.5512370467185974, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0242, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.5581703782081604, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0276, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.306773841381073, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0262, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.44620928168296814, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0229, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.5870804786682129, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0228, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.26162099838256836, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0278, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.27250319719314575, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0293, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.8330137729644775, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0315, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.5206989645957947, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0282, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.5408382415771484, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0359, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.30517199635505676, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0267, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.5315027236938477, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0206, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.46061626076698303, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0222, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.47393080592155457, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0262, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.3686772882938385, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0254, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.3312757611274719, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0243, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.565447986125946, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0267, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.5690101385116577, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0237, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.44088438153266907, + "learning_rate": 9.911670744652783e-06, + "loss": 0.028, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.3708919882774353, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0265, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.589698851108551, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0297, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.6541375517845154, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0288, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.5304558873176575, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0243, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.5774737000465393, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0277, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.5616280436515808, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0267, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.6129759550094604, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0223, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.45278221368789673, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0304, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.44487202167510986, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0296, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.5391712188720703, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0256, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.43523359298706055, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0277, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.5308435559272766, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0242, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.3361283540725708, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0236, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.3764631450176239, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0304, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.9003425240516663, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0278, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.2787775993347168, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0219, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.40089285373687744, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0284, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.3619711101055145, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0252, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.7354542016983032, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0242, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.3854006826877594, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0302, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.3318389058113098, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0265, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.5286651849746704, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0235, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.24921932816505432, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0259, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.7376067042350769, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0238, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.35099226236343384, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0257, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.3805389702320099, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0198, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.4433703124523163, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0241, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.3667793571949005, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0268, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.2963331639766693, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0223, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.9817414879798889, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0248, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.6529688835144043, + "learning_rate": 9.612315882780393e-06, + "loss": 0.032, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.7663154602050781, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0267, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.6086964011192322, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0281, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.5240464806556702, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0339, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.6558368802070618, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0284, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.6192268133163452, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0309, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.5293763875961304, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0257, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.38831329345703125, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0239, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.12827467918396, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0323, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.411818265914917, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0274, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.5521355867385864, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0233, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.26673075556755066, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0317, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.5205486416816711, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0273, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.8010990619659424, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0292, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.420612633228302, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0274, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.4811270236968994, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0277, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.4959382712841034, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0288, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.4607725739479065, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0245, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.9101414680480957, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0283, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.38626620173454285, + "learning_rate": 9.42959233811777e-06, + "loss": 0.026, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.5709372758865356, + "learning_rate": 9.419993062475743e-06, + "loss": 0.021, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.4417913854122162, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0291, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.5651213526725769, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0228, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.4716165363788605, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0242, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.9120892286300659, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0296, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.5004292130470276, + "learning_rate": 9.372024722887089e-06, + "loss": 0.033, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.3422714173793793, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0284, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.5391610264778137, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0362, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.5446203351020813, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0247, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.5441875457763672, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0284, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.48274070024490356, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0245, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.6035326719284058, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0226, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.3104001581668854, + "learning_rate": 9.304949604077693e-06, + "loss": 0.029, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.27859869599342346, + "learning_rate": 9.295375311262483e-06, + "loss": 0.022, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.3896406292915344, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0235, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.4526473581790924, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0289, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.6624506115913391, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0265, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.6976125836372375, + "learning_rate": 9.257098257046206e-06, + "loss": 0.029, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.5974310040473938, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0205, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.7627739906311035, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0333, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.3166525065898895, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0309, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.41519322991371155, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0223, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.31840237975120544, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0239, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.47412827610969543, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0228, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.41170552372932434, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0209, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.45858854055404663, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0243, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.7870534658432007, + "learning_rate": 9.171095634265995e-06, + "loss": 0.027, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.4080354869365692, + "learning_rate": 9.161550369445782e-06, + "loss": 0.023, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.47916823625564575, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0303, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.6911760568618774, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0263, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.3980148732662201, + "learning_rate": 9.132927564918328e-06, + "loss": 0.028, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.47085851430892944, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0266, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.5085862874984741, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0239, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.5219245553016663, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0267, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.5199264287948608, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0277, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.6157195568084717, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0343, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.5366696715354919, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0271, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.3640076220035553, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0258, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.5320505499839783, + "learning_rate": 9.05669731553499e-06, + "loss": 0.024, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.507826566696167, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0253, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.741392195224762, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0242, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.5325136184692383, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0224, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.4709665775299072, + "learning_rate": 9.018636566864313e-06, + "loss": 0.026, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.4371986985206604, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0264, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.47594818472862244, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0224, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.488423228263855, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0261, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.24745763838291168, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0206, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.5042629837989807, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0305, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.5255836844444275, + "learning_rate": 8.961615424107555e-06, + "loss": 0.026, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.4605107307434082, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0274, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.3252561390399933, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0227, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.35779184103012085, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0296, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.2960403263568878, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0212, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.6344659328460693, + "learning_rate": 8.914163487132906e-06, + "loss": 0.026, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.4614463150501251, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0234, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4490053951740265, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0265, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.5291271209716797, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0326, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.5311887264251709, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0257, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.5647584199905396, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0295, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.3913862705230713, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0256, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.4476219415664673, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0248, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.7807655930519104, + "learning_rate": 8.83836825410936e-06, + "loss": 0.026, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.38984328508377075, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0247, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.5757346153259277, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0296, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.25636178255081177, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0222, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.45617344975471497, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0224, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.3066493272781372, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0237, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.26513972878456116, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0277, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.445230633020401, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0248, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.4914413392543793, + "learning_rate": 8.762735374981932e-06, + "loss": 0.022, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.41469570994377136, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0245, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.33235347270965576, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0229, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.4890037775039673, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0247, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.41330578923225403, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0285, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.6309427618980408, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0233, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.42090296745300293, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0254, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.5888519287109375, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0262, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.5488774180412292, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0262, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.48015111684799194, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0219, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.4484168291091919, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0276, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.4128018319606781, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0218, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.5151517987251282, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0242, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.6248350143432617, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0267, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.4116908013820648, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0242, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.6138579249382019, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0282, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.22843605279922485, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0284, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.49555841088294983, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0244, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.5752411484718323, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0275, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.5129706859588623, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0237, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.751230001449585, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0257, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.47749435901641846, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0277, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21702095866203308, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0255, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.30658838152885437, + "learning_rate": 8.54624657467318e-06, + "loss": 0.024, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.3589625954627991, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0215, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.5434426069259644, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0224, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.8732438683509827, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0289, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.34988290071487427, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0226, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.4021032154560089, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0248, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.4676196873188019, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0235, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.41646474599838257, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0235, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.5892519950866699, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0221, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.5757095217704773, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0258, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.4664652645587921, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0275, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.4674879014492035, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0285, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.7277936339378357, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0316, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.40373867750167847, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0213, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.8632686138153076, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0239, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.5620945692062378, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0259, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3430384695529938, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0287, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.46981969475746155, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0218, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.3494231700897217, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0238, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.514975368976593, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0205, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.6442168951034546, + "learning_rate": 8.359228888944986e-06, + "loss": 0.021, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.32178881764411926, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0219, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.48865941166877747, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0261, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.6131434440612793, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0269, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.4471806585788727, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0251, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.8255780935287476, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0229, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.843673586845398, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0278, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.4278610348701477, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0228, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.5036011338233948, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0291, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.5141382813453674, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0217, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.8976346850395203, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0248, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.5634751319885254, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0276, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.5327013731002808, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0279, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.2723959982395172, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0225, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.4455258846282959, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0222, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.3784103989601135, + "learning_rate": 8.219774325200873e-06, + "loss": 0.024, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.8102694749832153, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0231, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.5179240703582764, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0255, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.39830490946769714, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0264, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.32860279083251953, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0241, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.5459582209587097, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0193, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.3841477036476135, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0282, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.7849119305610657, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0319, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.4457703232765198, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0279, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.30464428663253784, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0184, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 1.0635287761688232, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0265, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.33294421434402466, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0235, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.5644985437393188, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0218, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.4975566565990448, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0261, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.7503839135169983, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0218, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.35363277792930603, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0198, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.43968406319618225, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0253, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.4553394615650177, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0266, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.45489153265953064, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0264, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.424696147441864, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0209, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.4819740653038025, + "learning_rate": 8.03498318084394e-06, + "loss": 0.022, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.564834475517273, + "learning_rate": 8.025779439806006e-06, + "loss": 0.024, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.7905157804489136, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0261, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.6985124349594116, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0315, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.42378291487693787, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0237, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.5980759263038635, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0217, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.45916232466697693, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0235, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.25486481189727783, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0231, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.4072360694408417, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0261, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3813820481300354, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0209, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.3040210008621216, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0225, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.30910906195640564, + "learning_rate": 7.933935782312965e-06, + "loss": 0.026, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.6573566794395447, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0262, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.30632153153419495, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0251, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.3277539610862732, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0233, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.49589917063713074, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0211, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.4149130880832672, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0203, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.7051926851272583, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0272, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.8553881049156189, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0236, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.5676615238189697, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0242, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.29548707604408264, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0236, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.36076608300209045, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0219, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.3657922148704529, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0227, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.27593615651130676, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0251, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.35554730892181396, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0259, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.45652297139167786, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0274, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.5757999420166016, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0222, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.5138059854507446, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0216, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.338874876499176, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0232, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.48215195536613464, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0226, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.30239933729171753, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0205, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.6099343299865723, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0219, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.6730902791023254, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0239, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.4575020968914032, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0204, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.2673267424106598, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0222, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.3593531548976898, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0225, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.5385488867759705, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0248, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.3900541663169861, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0277, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.6182276010513306, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0241, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.4897976815700531, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0229, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.5717247128486633, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0273, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.4837515950202942, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0219, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.31954509019851685, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0271, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.23005163669586182, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0204, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.500217616558075, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0229, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.47326523065567017, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0203, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.5074726939201355, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0249, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.6583673357963562, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0243, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.7585731744766235, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0264, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.3782348036766052, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0216, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.43963512778282166, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0201, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.6450467109680176, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0254, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.3420482575893402, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0224, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.3532888889312744, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0216, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.32494598627090454, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0196, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.2898419499397278, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0234, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.4379838705062866, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0233, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.5390518307685852, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0169, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.3786150813102722, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0203, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.3376149833202362, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0266, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.40810349583625793, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0241, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.24485738575458527, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0199, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.4670563340187073, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0184, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.374255508184433, + "learning_rate": 7.4623904967312e-06, + "loss": 0.018, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.4191536605358124, + "learning_rate": 7.453427567620127e-06, + "loss": 0.022, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3807078003883362, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0232, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.7537381649017334, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0202, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.36507129669189453, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0236, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.24461498856544495, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0221, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.351654589176178, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0236, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.35627686977386475, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0213, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.4586603343486786, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0304, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.4082098603248596, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0237, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.47707459330558777, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0247, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.4687316119670868, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0344, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.4660017788410187, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0214, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.4644101560115814, + "learning_rate": 7.346200065486093e-06, + "loss": 0.022, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.3139079213142395, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0234, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.36445188522338867, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0262, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.6457782983779907, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0261, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.4184044599533081, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0245, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.44356703758239746, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0215, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.5394402742385864, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0302, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 0.5960429906845093, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0234, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2850514352321625, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0243, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.45071718096733093, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0233, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.3157344162464142, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0254, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.45518410205841064, + "learning_rate": 7.248450164740439e-06, + "loss": 0.024, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.2323702722787857, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0226, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.6025039553642273, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0246, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.4983830749988556, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0199, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.3684524595737457, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0252, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.36924007534980774, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0277, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.3531496822834015, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0228, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.3995579779148102, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0193, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.4124946892261505, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0221, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.3897329866886139, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0221, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.45230787992477417, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0238, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.45878538489341736, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0244, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.4302407503128052, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0237, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.30422642827033997, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0173, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.49566513299942017, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0201, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.43262094259262085, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0227, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.8250450491905212, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0259, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.3265332281589508, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0205, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.2871774435043335, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0201, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.4341558814048767, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0199, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.43365293741226196, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0201, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.5876246690750122, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0205, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.2719171643257141, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0211, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.38791123032569885, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0244, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.4082484543323517, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0206, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.5010205507278442, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0245, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.4404369294643402, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0268, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.5171347856521606, + "learning_rate": 7.010805483338283e-06, + "loss": 0.024, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.5137951970100403, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0241, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.563709557056427, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0193, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.44687238335609436, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0207, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.33815798163414, + "learning_rate": 6.975884226362e-06, + "loss": 0.0246, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.33789384365081787, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0206, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.38053908944129944, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0195, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.5730066299438477, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0199, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.42453598976135254, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0218, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.48010921478271484, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0328, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.5227254629135132, + "learning_rate": 6.923644220932124e-06, + "loss": 0.019, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4078599810600281, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0212, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.4473094046115875, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0281, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.3459968864917755, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0231, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.4205886721611023, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0256, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.5397320985794067, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0214, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.6208626627922058, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0224, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.34377506375312805, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0197, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.4086950123310089, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0202, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.5211176872253418, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0201, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3705415725708008, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0219, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.32692769169807434, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0204, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.42599135637283325, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0213, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.565449595451355, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0223, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.4027825593948364, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0233, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.4833034574985504, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0309, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.5570312738418579, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0213, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.30241742730140686, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0197, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.37468239665031433, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0214, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.5555301904678345, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0223, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.6084730625152588, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0261, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.5931955575942993, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0237, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.30350545048713684, + "learning_rate": 6.733587654719298e-06, + "loss": 0.02, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.6784055233001709, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0281, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.5559973120689392, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0204, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.7529487013816833, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0235, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.7032052874565125, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0176, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.5018401741981506, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0197, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.5020368695259094, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0231, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.3605690598487854, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0254, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.3482762575149536, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0223, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.4260469675064087, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0199, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.23622000217437744, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0239, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.3683573007583618, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0223, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.32972025871276855, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0228, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.4159783124923706, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0221, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.24288412928581238, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0188, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.42375463247299194, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0183, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.26672226190567017, + "learning_rate": 6.596880604028027e-06, + "loss": 0.02, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.30816635489463806, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0219, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.315452516078949, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0218, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.5412175059318542, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0233, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.4290241003036499, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0233, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.3977762460708618, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0239, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.4023628532886505, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0197, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.8707197308540344, + "learning_rate": 6.53748481975927e-06, + "loss": 0.029, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.37878328561782837, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0218, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.685556173324585, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0248, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.5783588886260986, + "learning_rate": 6.512107839793337e-06, + "loss": 0.02, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.5456825494766235, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0279, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.6162738800048828, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0259, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.38887348771095276, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0198, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.5207514762878418, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0201, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.671120822429657, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0259, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.28870952129364014, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0175, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.3909374177455902, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0214, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.3419650197029114, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0217, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.563515305519104, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0185, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.6295453310012817, + "learning_rate": 6.427861749601945e-06, + "loss": 0.023, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.4404713213443756, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0188, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.698448121547699, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0225, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.5679222941398621, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0213, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.5237470269203186, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0261, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.4205586016178131, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0232, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.36608314514160156, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.02, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.49511757493019104, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0247, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.3475521206855774, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0202, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.36345914006233215, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0197, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.34304162859916687, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0183, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.41459065675735474, + "learning_rate": 6.335811156758245e-06, + "loss": 0.02, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.34139952063560486, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0211, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.29463231563568115, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0225, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.37984198331832886, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0201, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.21912901103496552, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0226, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.34660178422927856, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0179, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.6080809235572815, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0187, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.43388310074806213, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0226, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.53389972448349, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0237, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.39731428027153015, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0176, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.32715681195259094, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0211, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.36709150671958923, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0194, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.5554866790771484, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0202, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.26253199577331543, + "learning_rate": 6.227878992893104e-06, + "loss": 0.02, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.3686104714870453, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0191, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.36151114106178284, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0213, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.5019435882568359, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0203, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 1.1914043426513672, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0249, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.45042529702186584, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0244, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.3239169120788574, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0219, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.3253174424171448, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0226, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.6497724652290344, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0238, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.5800855159759521, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0211, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.29717954993247986, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0198, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.35056066513061523, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0219, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.28448906540870667, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0227, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.33300310373306274, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0165, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.5134487748146057, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0219, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.45153549313545227, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0191, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.6483689546585083, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0211, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.5660327076911926, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0207, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.6027820706367493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0201, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.6102983951568604, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0207, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.4383072257041931, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0275, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.42298370599746704, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0204, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.30508092045783997, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0195, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.6242369413375854, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0215, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.38399502635002136, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0201, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.4721924066543579, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0243, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.6958035230636597, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0201, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.3826717436313629, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0236, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.3098534941673279, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0216, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.43973061442375183, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0234, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.46570682525634766, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0226, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.46847036480903625, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0188, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.5139725804328918, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0195, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.48436662554740906, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0206, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.3445553481578827, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0241, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.8473356366157532, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0248, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.6241415143013, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0242, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.7302873730659485, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0224, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.29269692301750183, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0181, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.4065910577774048, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0253, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.36930134892463684, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0203, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.5521696209907532, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0208, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.3761119544506073, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0209, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.3330603241920471, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0233, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.27771884202957153, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0162, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.4225069284439087, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0177, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.33680275082588196, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0199, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.4399181604385376, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0236, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.49677175283432007, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0265, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.39700835943222046, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0193, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.4604041278362274, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0208, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.26002946496009827, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0197, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.3256632685661316, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0192, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3573099672794342, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0184, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.3116256892681122, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0197, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.39247608184814453, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0219, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.31291085481643677, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0194, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.5996116399765015, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0264, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.24854864180088043, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0207, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.5746667385101318, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0195, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.5744135975837708, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0182, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.5161272883415222, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0212, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.5889247059822083, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0172, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.53412926197052, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0209, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.3421672582626343, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0193, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.409906268119812, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0173, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.5139239430427551, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0198, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.5014253258705139, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0177, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.5942979454994202, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0206, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.218281552195549, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0204, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.43725427985191345, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0215, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.3467969000339508, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0168, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.2697127163410187, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0214, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.43687018752098083, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0262, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.47759339213371277, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0212, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.33211249113082886, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0228, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.29453045129776, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0233, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.34539318084716797, + "learning_rate": 5.608700869895367e-06, + "loss": 0.021, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.6664339900016785, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0203, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.21404555439949036, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0209, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.4320753812789917, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0236, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.415399968624115, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0235, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.2643829584121704, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0203, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.4354988932609558, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0172, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.43992263078689575, + "learning_rate": 5.554208267666996e-06, + "loss": 0.018, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.32208460569381714, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0183, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.27261701226234436, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0196, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.4348963499069214, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0173, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.40379852056503296, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0202, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.4592876136302948, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0219, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.4797484278678894, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0182, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.47892817854881287, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0185, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.46308979392051697, + "learning_rate": 5.492314644463202e-06, + "loss": 0.018, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.7745133638381958, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0207, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.6577957272529602, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0166, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.43036580085754395, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0218, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.41811347007751465, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0214, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.31980884075164795, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0198, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.3632652461528778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0209, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.467146635055542, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0173, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.5659807920455933, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0199, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.24540813267230988, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0178, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.3122001588344574, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0222, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2879388928413391, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0173, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.5185259580612183, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0168, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.239187091588974, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0198, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3844532370567322, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0179, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.3842040002346039, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0204, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.26496851444244385, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0172, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.40850451588630676, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0189, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21669425070285797, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0192, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.43664559721946716, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.021, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.49064821004867554, + "learning_rate": 5.339400468833427e-06, + "loss": 0.02, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.9060949683189392, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0204, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.3413904309272766, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0212, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.2620849311351776, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0201, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.3972470760345459, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0216, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.4422028064727783, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0177, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.2595955431461334, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0214, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.43522438406944275, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0226, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.33024686574935913, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0199, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.3532852232456207, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0194, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.3963644802570343, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0171, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.37003734707832336, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0174, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.27832016348838806, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0211, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.4203765392303467, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0196, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.31796127557754517, + "learning_rate": 5.233937303988081e-06, + "loss": 0.019, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.4561198949813843, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0198, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.4175209403038025, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0195, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.7017586827278137, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0201, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.4711352288722992, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.02, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.2737489640712738, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0198, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.44284430146217346, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0206, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.4556163251399994, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0208, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.3158712685108185, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0156, + "step": 22000 + }, + { + "epoch": 1.3188327640961113, + "grad_norm": 0.4620053172111511, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0187, + "step": 22010 + }, + { + "epoch": 1.3194319611720295, + "grad_norm": 0.7892107963562012, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0195, + "step": 22020 + }, + { + "epoch": 1.3200311582479478, + "grad_norm": 0.37334534525871277, + "learning_rate": 5.152002600477859e-06, + "loss": 0.02, + "step": 22030 + }, + { + "epoch": 1.320630355323866, + "grad_norm": 0.4440039098262787, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0244, + "step": 22040 + }, + { + "epoch": 1.3212295523997843, + "grad_norm": 0.2650533616542816, + "learning_rate": 5.137194259935739e-06, + "loss": 0.017, + "step": 22050 + }, + { + "epoch": 1.3218287494757026, + "grad_norm": 0.5425522327423096, + "learning_rate": 5.129800405815733e-06, + "loss": 0.019, + "step": 22060 + }, + { + "epoch": 1.3224279465516209, + "grad_norm": 0.5764152407646179, + "learning_rate": 5.122413440701921e-06, + "loss": 0.018, + "step": 22070 + }, + { + "epoch": 1.3230271436275391, + "grad_norm": 0.3985585868358612, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0214, + "step": 22080 + }, + { + "epoch": 1.3236263407034574, + "grad_norm": 0.513511598110199, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0189, + "step": 22090 + }, + { + "epoch": 1.3242255377793757, + "grad_norm": 0.3784070909023285, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0164, + "step": 22100 + }, + { + "epoch": 1.324824734855294, + "grad_norm": 0.7029585242271423, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0201, + "step": 22110 + }, + { + "epoch": 1.3254239319312122, + "grad_norm": 0.28351524472236633, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0207, + "step": 22120 + }, + { + "epoch": 1.3260231290071305, + "grad_norm": 0.5500089526176453, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0222, + "step": 22130 + }, + { + "epoch": 1.3266223260830488, + "grad_norm": 0.35926392674446106, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0195, + "step": 22140 + }, + { + "epoch": 1.327221523158967, + "grad_norm": 0.24845866858959198, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0198, + "step": 22150 + }, + { + "epoch": 1.3278207202348853, + "grad_norm": 0.3264683485031128, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0178, + "step": 22160 + }, + { + "epoch": 1.3284199173108036, + "grad_norm": 0.47955816984176636, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0206, + "step": 22170 + }, + { + "epoch": 1.3290191143867218, + "grad_norm": 0.31802570819854736, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0168, + "step": 22180 + }, + { + "epoch": 1.32961831146264, + "grad_norm": 0.40685755014419556, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0223, + "step": 22190 + }, + { + "epoch": 1.3302175085385584, + "grad_norm": 0.4924621284008026, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0195, + "step": 22200 + }, + { + "epoch": 1.3308167056144766, + "grad_norm": 0.640724241733551, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0183, + "step": 22210 + }, + { + "epoch": 1.331415902690395, + "grad_norm": 0.6712080836296082, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0196, + "step": 22220 + }, + { + "epoch": 1.3320150997663132, + "grad_norm": 0.34785783290863037, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0174, + "step": 22230 + }, + { + "epoch": 1.3326142968422314, + "grad_norm": 0.46851038932800293, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0186, + "step": 22240 + }, + { + "epoch": 1.3332134939181497, + "grad_norm": 0.6138949394226074, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0197, + "step": 22250 + }, + { + "epoch": 1.333812690994068, + "grad_norm": 0.3083338439464569, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0179, + "step": 22260 + }, + { + "epoch": 1.3344118880699862, + "grad_norm": 0.3143295347690582, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0217, + "step": 22270 + }, + { + "epoch": 1.3350110851459045, + "grad_norm": 0.3330692946910858, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0149, + "step": 22280 + }, + { + "epoch": 1.3356102822218228, + "grad_norm": 0.2732333242893219, + "learning_rate": 4.961660586405147e-06, + "loss": 0.017, + "step": 22290 + }, + { + "epoch": 1.336209479297741, + "grad_norm": 0.3350054621696472, + "learning_rate": 4.954434444590436e-06, + "loss": 0.022, + "step": 22300 + }, + { + "epoch": 1.3368086763736593, + "grad_norm": 0.2735322415828705, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0181, + "step": 22310 + }, + { + "epoch": 1.3374078734495776, + "grad_norm": 0.5919206738471985, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0201, + "step": 22320 + }, + { + "epoch": 1.3380070705254958, + "grad_norm": 0.28201058506965637, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0188, + "step": 22330 + }, + { + "epoch": 1.338606267601414, + "grad_norm": 0.505592942237854, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0188, + "step": 22340 + }, + { + "epoch": 1.3392054646773324, + "grad_norm": 0.5231548547744751, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0184, + "step": 22350 + }, + { + "epoch": 1.3398046617532506, + "grad_norm": 0.3743092715740204, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0176, + "step": 22360 + }, + { + "epoch": 1.340403858829169, + "grad_norm": 0.5908241271972656, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0224, + "step": 22370 + }, + { + "epoch": 1.3410030559050872, + "grad_norm": 0.4231952428817749, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0177, + "step": 22380 + }, + { + "epoch": 1.3416022529810054, + "grad_norm": 0.5666583180427551, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0218, + "step": 22390 + }, + { + "epoch": 1.3422014500569237, + "grad_norm": 0.4740161597728729, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0179, + "step": 22400 + }, + { + "epoch": 1.342800647132842, + "grad_norm": 0.3947773873806, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.02, + "step": 22410 + }, + { + "epoch": 1.3433998442087602, + "grad_norm": 0.3114109933376312, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0223, + "step": 22420 + }, + { + "epoch": 1.3439990412846785, + "grad_norm": 0.44969403743743896, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0169, + "step": 22430 + }, + { + "epoch": 1.3445982383605968, + "grad_norm": 0.29602059721946716, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0168, + "step": 22440 + }, + { + "epoch": 1.345197435436515, + "grad_norm": 0.3884619474411011, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0205, + "step": 22450 + }, + { + "epoch": 1.3457966325124333, + "grad_norm": 0.2929127514362335, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0149, + "step": 22460 + }, + { + "epoch": 1.3463958295883516, + "grad_norm": 0.4955149292945862, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0213, + "step": 22470 + }, + { + "epoch": 1.3469950266642698, + "grad_norm": 0.4021163582801819, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0192, + "step": 22480 + }, + { + "epoch": 1.3475942237401881, + "grad_norm": 0.2945493757724762, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.02, + "step": 22490 + }, + { + "epoch": 1.3481934208161064, + "grad_norm": 0.34085726737976074, + "learning_rate": 4.81141273556404e-06, + "loss": 0.0286, + "step": 22500 + }, + { + "epoch": 1.3487926178920246, + "grad_norm": 0.32751014828681946, + "learning_rate": 4.804337352679613e-06, + "loss": 0.0226, + "step": 22510 + }, + { + "epoch": 1.349391814967943, + "grad_norm": 0.3844929337501526, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0155, + "step": 22520 + }, + { + "epoch": 1.3499910120438612, + "grad_norm": 0.5286590456962585, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0229, + "step": 22530 + }, + { + "epoch": 1.3505902091197795, + "grad_norm": 0.26664429903030396, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0151, + "step": 22540 + }, + { + "epoch": 1.3511894061956977, + "grad_norm": 0.528367280960083, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0239, + "step": 22550 + }, + { + "epoch": 1.351788603271616, + "grad_norm": 0.5871155858039856, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0196, + "step": 22560 + }, + { + "epoch": 1.3523878003475343, + "grad_norm": 0.5686034560203552, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0184, + "step": 22570 + }, + { + "epoch": 1.3529869974234525, + "grad_norm": 0.40526366233825684, + "learning_rate": 4.755013723146175e-06, + "loss": 0.018, + "step": 22580 + }, + { + "epoch": 1.3535861944993708, + "grad_norm": 0.37055784463882446, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0184, + "step": 22590 + }, + { + "epoch": 1.354185391575289, + "grad_norm": 0.5210561156272888, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0165, + "step": 22600 + }, + { + "epoch": 1.3547845886512073, + "grad_norm": 0.3386324942111969, + "learning_rate": 4.733984792194363e-06, + "loss": 0.018, + "step": 22610 + }, + { + "epoch": 1.3553837857271256, + "grad_norm": 0.40071168541908264, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0198, + "step": 22620 + }, + { + "epoch": 1.3559829828030439, + "grad_norm": 0.3415983319282532, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0168, + "step": 22630 + }, + { + "epoch": 1.3565821798789621, + "grad_norm": 0.3700709939002991, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0166, + "step": 22640 + }, + { + "epoch": 1.3571813769548804, + "grad_norm": 0.3559338450431824, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0174, + "step": 22650 + }, + { + "epoch": 1.3577805740307987, + "grad_norm": 0.5588265657424927, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0207, + "step": 22660 + }, + { + "epoch": 1.358379771106717, + "grad_norm": 0.4539838433265686, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0164, + "step": 22670 + }, + { + "epoch": 1.3589789681826352, + "grad_norm": 0.34879690408706665, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0165, + "step": 22680 + }, + { + "epoch": 1.3595781652585535, + "grad_norm": 0.22862373292446136, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0158, + "step": 22690 + }, + { + "epoch": 1.3601773623344717, + "grad_norm": 0.5536275506019592, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0137, + "step": 22700 + }, + { + "epoch": 1.36077655941039, + "grad_norm": 0.5599532127380371, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0206, + "step": 22710 + }, + { + "epoch": 1.3613757564863083, + "grad_norm": 0.2961312532424927, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0138, + "step": 22720 + }, + { + "epoch": 1.3619749535622265, + "grad_norm": 0.5834526419639587, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0174, + "step": 22730 + }, + { + "epoch": 1.362574150638145, + "grad_norm": 0.5941792726516724, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0205, + "step": 22740 + }, + { + "epoch": 1.363173347714063, + "grad_norm": 0.2580801844596863, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0199, + "step": 22750 + }, + { + "epoch": 1.3637725447899816, + "grad_norm": 0.3897567689418793, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0168, + "step": 22760 + }, + { + "epoch": 1.3643717418658996, + "grad_norm": 0.37937042117118835, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0213, + "step": 22770 + }, + { + "epoch": 1.364970938941818, + "grad_norm": 0.3964179456233978, + "learning_rate": 4.616077433849538e-06, + "loss": 0.019, + "step": 22780 + }, + { + "epoch": 1.3655701360177361, + "grad_norm": 0.3632303476333618, + "learning_rate": 4.609208744970524e-06, + "loss": 0.015, + "step": 22790 + }, + { + "epoch": 1.3661693330936546, + "grad_norm": 0.5750122666358948, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0168, + "step": 22800 + }, + { + "epoch": 1.3667685301695727, + "grad_norm": 0.36310067772865295, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0172, + "step": 22810 + }, + { + "epoch": 1.3673677272454912, + "grad_norm": 0.5438339114189148, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0198, + "step": 22820 + }, + { + "epoch": 1.3679669243214092, + "grad_norm": 0.37394630908966064, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0202, + "step": 22830 + }, + { + "epoch": 1.3685661213973277, + "grad_norm": 0.2454962432384491, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0188, + "step": 22840 + }, + { + "epoch": 1.3691653184732457, + "grad_norm": 0.474844366312027, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0223, + "step": 22850 + }, + { + "epoch": 1.3697645155491642, + "grad_norm": 0.30256277322769165, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0188, + "step": 22860 + }, + { + "epoch": 1.3703637126250823, + "grad_norm": 0.500045657157898, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0179, + "step": 22870 + }, + { + "epoch": 1.3709629097010008, + "grad_norm": 0.609107494354248, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0182, + "step": 22880 + }, + { + "epoch": 1.3715621067769188, + "grad_norm": 0.20867787301540375, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0168, + "step": 22890 + }, + { + "epoch": 1.3721613038528373, + "grad_norm": 0.41653770208358765, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0172, + "step": 22900 + }, + { + "epoch": 1.3727605009287553, + "grad_norm": 0.357435941696167, + "learning_rate": 4.527371771040039e-06, + "loss": 0.017, + "step": 22910 + }, + { + "epoch": 1.3733596980046738, + "grad_norm": 0.5994096994400024, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0177, + "step": 22920 + }, + { + "epoch": 1.3739588950805919, + "grad_norm": 0.3150171935558319, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0164, + "step": 22930 + }, + { + "epoch": 1.3745580921565104, + "grad_norm": 0.4483601748943329, + "learning_rate": 4.507082898761475e-06, + "loss": 0.019, + "step": 22940 + }, + { + "epoch": 1.3751572892324284, + "grad_norm": 0.529812753200531, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0175, + "step": 22950 + }, + { + "epoch": 1.375756486308347, + "grad_norm": 0.26758334040641785, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0187, + "step": 22960 + }, + { + "epoch": 1.376355683384265, + "grad_norm": 0.3228643834590912, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0195, + "step": 22970 + }, + { + "epoch": 1.3769548804601834, + "grad_norm": 0.3437839150428772, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0207, + "step": 22980 + }, + { + "epoch": 1.3775540775361017, + "grad_norm": 0.28592896461486816, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0185, + "step": 22990 + }, + { + "epoch": 1.37815327461202, + "grad_norm": 0.5544041991233826, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0191, + "step": 23000 + }, + { + "epoch": 1.3787524716879382, + "grad_norm": 1.0831762552261353, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0237, + "step": 23010 + }, + { + "epoch": 1.3793516687638565, + "grad_norm": 0.3546636700630188, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0203, + "step": 23020 + }, + { + "epoch": 1.3799508658397748, + "grad_norm": 0.32998642325401306, + "learning_rate": 4.446628604336844e-06, + "loss": 0.018, + "step": 23030 + }, + { + "epoch": 1.380550062915693, + "grad_norm": 0.40987834334373474, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0189, + "step": 23040 + }, + { + "epoch": 1.3811492599916113, + "grad_norm": 0.6094655990600586, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0174, + "step": 23050 + }, + { + "epoch": 1.3817484570675296, + "grad_norm": 0.631481409072876, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0179, + "step": 23060 + }, + { + "epoch": 1.3823476541434478, + "grad_norm": 0.4069002866744995, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0192, + "step": 23070 + }, + { + "epoch": 1.3829468512193661, + "grad_norm": 0.36600202322006226, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0196, + "step": 23080 + }, + { + "epoch": 1.3835460482952844, + "grad_norm": 0.3092246353626251, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0185, + "step": 23090 + }, + { + "epoch": 1.3841452453712026, + "grad_norm": 0.2811580300331116, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0162, + "step": 23100 + }, + { + "epoch": 1.384744442447121, + "grad_norm": 0.4177345037460327, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0196, + "step": 23110 + }, + { + "epoch": 1.3853436395230392, + "grad_norm": 0.40211164951324463, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0199, + "step": 23120 + }, + { + "epoch": 1.3859428365989575, + "grad_norm": 0.31014713644981384, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0173, + "step": 23130 + }, + { + "epoch": 1.3865420336748757, + "grad_norm": 0.5378808379173279, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0189, + "step": 23140 + }, + { + "epoch": 1.387141230750794, + "grad_norm": 0.3483606278896332, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0201, + "step": 23150 + }, + { + "epoch": 1.3877404278267123, + "grad_norm": 0.5112893581390381, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0209, + "step": 23160 + }, + { + "epoch": 1.3883396249026305, + "grad_norm": 0.26471400260925293, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.013, + "step": 23170 + }, + { + "epoch": 1.3889388219785488, + "grad_norm": 0.6770564317703247, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0174, + "step": 23180 + }, + { + "epoch": 1.389538019054467, + "grad_norm": 0.4251134693622589, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0169, + "step": 23190 + }, + { + "epoch": 1.3901372161303853, + "grad_norm": 0.2985415458679199, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0212, + "step": 23200 + }, + { + "epoch": 1.3907364132063036, + "grad_norm": 0.4635870158672333, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0164, + "step": 23210 + }, + { + "epoch": 1.3913356102822219, + "grad_norm": 0.4360525906085968, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0174, + "step": 23220 + }, + { + "epoch": 1.3919348073581401, + "grad_norm": 0.6121042370796204, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0176, + "step": 23230 + }, + { + "epoch": 1.3925340044340584, + "grad_norm": 0.3049333095550537, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0162, + "step": 23240 + }, + { + "epoch": 1.3931332015099767, + "grad_norm": 0.46471482515335083, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0193, + "step": 23250 + }, + { + "epoch": 1.393732398585895, + "grad_norm": 0.27093327045440674, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0204, + "step": 23260 + }, + { + "epoch": 1.3943315956618132, + "grad_norm": 0.3513331711292267, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0209, + "step": 23270 + }, + { + "epoch": 1.3949307927377315, + "grad_norm": 0.3452320396900177, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0172, + "step": 23280 + }, + { + "epoch": 1.3955299898136497, + "grad_norm": 0.44609951972961426, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0198, + "step": 23290 + }, + { + "epoch": 1.396129186889568, + "grad_norm": 0.27217286825180054, + "learning_rate": 4.269026084410863e-06, + "loss": 0.016, + "step": 23300 + }, + { + "epoch": 1.3967283839654863, + "grad_norm": 0.5857428908348083, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0206, + "step": 23310 + }, + { + "epoch": 1.3973275810414045, + "grad_norm": 0.3834620714187622, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0165, + "step": 23320 + }, + { + "epoch": 1.3979267781173228, + "grad_norm": 0.34176892042160034, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0156, + "step": 23330 + }, + { + "epoch": 1.398525975193241, + "grad_norm": 0.2497260719537735, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0183, + "step": 23340 + }, + { + "epoch": 1.3991251722691593, + "grad_norm": 0.3003418743610382, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0188, + "step": 23350 + }, + { + "epoch": 1.3997243693450776, + "grad_norm": 0.19922316074371338, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0162, + "step": 23360 + }, + { + "epoch": 1.4003235664209959, + "grad_norm": 0.5160003900527954, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0181, + "step": 23370 + }, + { + "epoch": 1.4009227634969141, + "grad_norm": 0.4917953312397003, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0197, + "step": 23380 + }, + { + "epoch": 1.4015219605728324, + "grad_norm": 0.2868032455444336, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0192, + "step": 23390 + }, + { + "epoch": 1.4021211576487507, + "grad_norm": 0.30980560183525085, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0178, + "step": 23400 + }, + { + "epoch": 1.402720354724669, + "grad_norm": 0.31523144245147705, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0193, + "step": 23410 + }, + { + "epoch": 1.4033195518005872, + "grad_norm": 0.23731909692287445, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0171, + "step": 23420 + }, + { + "epoch": 1.4039187488765055, + "grad_norm": 0.4911767542362213, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0171, + "step": 23430 + }, + { + "epoch": 1.4045179459524237, + "grad_norm": 0.3095512390136719, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0165, + "step": 23440 + }, + { + "epoch": 1.405117143028342, + "grad_norm": 0.6421821117401123, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0178, + "step": 23450 + }, + { + "epoch": 1.4057163401042603, + "grad_norm": 0.4887765645980835, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0212, + "step": 23460 + }, + { + "epoch": 1.4063155371801785, + "grad_norm": 0.4543951451778412, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0165, + "step": 23470 + }, + { + "epoch": 1.4069147342560968, + "grad_norm": 0.4595223367214203, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0144, + "step": 23480 + }, + { + "epoch": 1.407513931332015, + "grad_norm": 0.6325511336326599, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0203, + "step": 23490 + }, + { + "epoch": 1.4081131284079333, + "grad_norm": 0.6220779418945312, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0225, + "step": 23500 + }, + { + "epoch": 1.4087123254838516, + "grad_norm": 0.3728989362716675, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0202, + "step": 23510 + }, + { + "epoch": 1.4093115225597699, + "grad_norm": 0.4958861470222473, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0204, + "step": 23520 + }, + { + "epoch": 1.4099107196356881, + "grad_norm": 0.32445529103279114, + "learning_rate": 4.122270968037107e-06, + "loss": 0.016, + "step": 23530 + }, + { + "epoch": 1.4105099167116064, + "grad_norm": 0.3969140350818634, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0174, + "step": 23540 + }, + { + "epoch": 1.4111091137875247, + "grad_norm": 0.39698946475982666, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0163, + "step": 23550 + }, + { + "epoch": 1.411708310863443, + "grad_norm": 0.4633882939815521, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0179, + "step": 23560 + }, + { + "epoch": 1.4123075079393612, + "grad_norm": 0.36993899941444397, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0216, + "step": 23570 + }, + { + "epoch": 1.4129067050152795, + "grad_norm": 0.4137882590293884, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0187, + "step": 23580 + }, + { + "epoch": 1.4135059020911978, + "grad_norm": 0.320867121219635, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0238, + "step": 23590 + }, + { + "epoch": 1.414105099167116, + "grad_norm": 0.3139745593070984, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0175, + "step": 23600 + }, + { + "epoch": 1.4147042962430343, + "grad_norm": 0.572628378868103, + "learning_rate": 4.072221948222934e-06, + "loss": 0.018, + "step": 23610 + }, + { + "epoch": 1.4153034933189526, + "grad_norm": 0.575975239276886, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0189, + "step": 23620 + }, + { + "epoch": 1.4159026903948708, + "grad_norm": 0.26301854848861694, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0121, + "step": 23630 + }, + { + "epoch": 1.416501887470789, + "grad_norm": 0.3042408525943756, + "learning_rate": 4.053587511509546e-06, + "loss": 0.0185, + "step": 23640 + }, + { + "epoch": 1.4171010845467074, + "grad_norm": 0.2503415644168854, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0208, + "step": 23650 + }, + { + "epoch": 1.4177002816226256, + "grad_norm": 0.3556166887283325, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0202, + "step": 23660 + }, + { + "epoch": 1.418299478698544, + "grad_norm": 0.652975857257843, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0194, + "step": 23670 + }, + { + "epoch": 1.4188986757744622, + "grad_norm": 0.4215060770511627, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0166, + "step": 23680 + }, + { + "epoch": 1.4194978728503804, + "grad_norm": 0.2277296483516693, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0172, + "step": 23690 + }, + { + "epoch": 1.4200970699262987, + "grad_norm": 0.3370293378829956, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0201, + "step": 23700 + }, + { + "epoch": 1.420696267002217, + "grad_norm": 0.4235946834087372, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0189, + "step": 23710 + }, + { + "epoch": 1.4212954640781352, + "grad_norm": 1.0387974977493286, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0176, + "step": 23720 + }, + { + "epoch": 1.4218946611540535, + "grad_norm": 0.7258256077766418, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0204, + "step": 23730 + }, + { + "epoch": 1.4224938582299718, + "grad_norm": 0.35412806272506714, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0165, + "step": 23740 + }, + { + "epoch": 1.42309305530589, + "grad_norm": 0.5192556977272034, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0166, + "step": 23750 + }, + { + "epoch": 1.4236922523818083, + "grad_norm": 0.3292843699455261, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0163, + "step": 23760 + }, + { + "epoch": 1.4242914494577266, + "grad_norm": 0.46782153844833374, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0174, + "step": 23770 + }, + { + "epoch": 1.4248906465336448, + "grad_norm": 0.6324945092201233, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0183, + "step": 23780 + }, + { + "epoch": 1.4254898436095633, + "grad_norm": 0.4347882568836212, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0138, + "step": 23790 + }, + { + "epoch": 1.4260890406854814, + "grad_norm": 0.3393082320690155, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0155, + "step": 23800 + }, + { + "epoch": 1.4266882377613999, + "grad_norm": 0.28411221504211426, + "learning_rate": 3.949383948670156e-06, + "loss": 0.016, + "step": 23810 + }, + { + "epoch": 1.427287434837318, + "grad_norm": 0.45982369780540466, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0134, + "step": 23820 + }, + { + "epoch": 1.4278866319132364, + "grad_norm": 0.32810381054878235, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0163, + "step": 23830 + }, + { + "epoch": 1.4284858289891544, + "grad_norm": 0.5996097922325134, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0246, + "step": 23840 + }, + { + "epoch": 1.429085026065073, + "grad_norm": 0.40002167224884033, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0158, + "step": 23850 + }, + { + "epoch": 1.429684223140991, + "grad_norm": 0.4102090299129486, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0179, + "step": 23860 + }, + { + "epoch": 1.4302834202169095, + "grad_norm": 0.44915929436683655, + "learning_rate": 3.913175335139808e-06, + "loss": 0.019, + "step": 23870 + }, + { + "epoch": 1.4308826172928275, + "grad_norm": 0.251206636428833, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0183, + "step": 23880 + }, + { + "epoch": 1.431481814368746, + "grad_norm": 0.2564012408256531, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0182, + "step": 23890 + }, + { + "epoch": 1.432081011444664, + "grad_norm": 0.431265652179718, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0177, + "step": 23900 + }, + { + "epoch": 1.4326802085205825, + "grad_norm": 0.42389997839927673, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0146, + "step": 23910 + }, + { + "epoch": 1.4332794055965006, + "grad_norm": 0.9380725622177124, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0206, + "step": 23920 + }, + { + "epoch": 1.433878602672419, + "grad_norm": 0.3655669093132019, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0151, + "step": 23930 + }, + { + "epoch": 1.4344777997483371, + "grad_norm": 0.3248157501220703, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0152, + "step": 23940 + }, + { + "epoch": 1.4350769968242556, + "grad_norm": 0.5733596086502075, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0175, + "step": 23950 + }, + { + "epoch": 1.4356761939001736, + "grad_norm": 0.4672720730304718, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0185, + "step": 23960 + }, + { + "epoch": 1.4362753909760921, + "grad_norm": 0.22989575564861298, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0165, + "step": 23970 + }, + { + "epoch": 1.4368745880520102, + "grad_norm": 1.0956321954727173, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0181, + "step": 23980 + }, + { + "epoch": 1.4374737851279287, + "grad_norm": 0.39079031348228455, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0212, + "step": 23990 + }, + { + "epoch": 1.4380729822038467, + "grad_norm": 0.3974068760871887, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0197, + "step": 24000 + }, + { + "epoch": 1.4386721792797652, + "grad_norm": 1.1926871538162231, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0185, + "step": 24010 + }, + { + "epoch": 1.4392713763556833, + "grad_norm": 0.40923064947128296, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0171, + "step": 24020 + }, + { + "epoch": 1.4398705734316017, + "grad_norm": 0.38384920358657837, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0161, + "step": 24030 + }, + { + "epoch": 1.4404697705075198, + "grad_norm": 0.21791735291481018, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0168, + "step": 24040 + }, + { + "epoch": 1.4410689675834383, + "grad_norm": 0.3207184672355652, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0147, + "step": 24050 + }, + { + "epoch": 1.4416681646593565, + "grad_norm": 0.4831724166870117, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0182, + "step": 24060 + }, + { + "epoch": 1.4422673617352748, + "grad_norm": 0.47996360063552856, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0175, + "step": 24070 + }, + { + "epoch": 1.442866558811193, + "grad_norm": 0.41330286860466003, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0172, + "step": 24080 + }, + { + "epoch": 1.4434657558871113, + "grad_norm": 0.5012956857681274, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0234, + "step": 24090 + }, + { + "epoch": 1.4440649529630296, + "grad_norm": 0.4715912640094757, + "learning_rate": 3.777162510056721e-06, + "loss": 0.016, + "step": 24100 + }, + { + "epoch": 1.4446641500389479, + "grad_norm": 0.3817141652107239, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0158, + "step": 24110 + }, + { + "epoch": 1.4452633471148661, + "grad_norm": 0.3964484930038452, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0216, + "step": 24120 + }, + { + "epoch": 1.4458625441907844, + "grad_norm": 0.29786166548728943, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0197, + "step": 24130 + }, + { + "epoch": 1.4464617412667027, + "grad_norm": 0.2796359360218048, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.018, + "step": 24140 + }, + { + "epoch": 1.447060938342621, + "grad_norm": 0.30957916378974915, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0154, + "step": 24150 + }, + { + "epoch": 1.4476601354185392, + "grad_norm": 0.3837800920009613, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0171, + "step": 24160 + }, + { + "epoch": 1.4482593324944575, + "grad_norm": 0.29726749658584595, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0167, + "step": 24170 + }, + { + "epoch": 1.4488585295703758, + "grad_norm": 0.4624067544937134, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0187, + "step": 24180 + }, + { + "epoch": 1.449457726646294, + "grad_norm": 0.46996721625328064, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0156, + "step": 24190 + }, + { + "epoch": 1.4500569237222123, + "grad_norm": 0.351532518863678, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0221, + "step": 24200 + }, + { + "epoch": 1.4506561207981306, + "grad_norm": 0.5119938254356384, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0194, + "step": 24210 + }, + { + "epoch": 1.4512553178740488, + "grad_norm": 0.5102914571762085, + "learning_rate": 3.707974016467e-06, + "loss": 0.0152, + "step": 24220 + }, + { + "epoch": 1.451854514949967, + "grad_norm": 0.4638414680957794, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0167, + "step": 24230 + }, + { + "epoch": 1.4524537120258854, + "grad_norm": 0.6181433200836182, + "learning_rate": 3.696562092850226e-06, + "loss": 0.016, + "step": 24240 + }, + { + "epoch": 1.4530529091018036, + "grad_norm": 0.31810933351516724, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0175, + "step": 24250 + }, + { + "epoch": 1.453652106177722, + "grad_norm": 0.20725348591804504, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0139, + "step": 24260 + }, + { + "epoch": 1.4542513032536402, + "grad_norm": 0.29788675904273987, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0153, + "step": 24270 + }, + { + "epoch": 1.4548505003295584, + "grad_norm": 0.286422997713089, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0171, + "step": 24280 + }, + { + "epoch": 1.4554496974054767, + "grad_norm": 0.31199127435684204, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0183, + "step": 24290 + }, + { + "epoch": 1.456048894481395, + "grad_norm": 0.5850293040275574, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0199, + "step": 24300 + }, + { + "epoch": 1.4566480915573132, + "grad_norm": 0.5558650493621826, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0153, + "step": 24310 + }, + { + "epoch": 1.4572472886332315, + "grad_norm": 0.5221429467201233, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0159, + "step": 24320 + }, + { + "epoch": 1.4578464857091498, + "grad_norm": 0.40443119406700134, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0176, + "step": 24330 + }, + { + "epoch": 1.458445682785068, + "grad_norm": 0.4657982289791107, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0183, + "step": 24340 + }, + { + "epoch": 1.4590448798609863, + "grad_norm": 0.23784635961055756, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0156, + "step": 24350 + }, + { + "epoch": 1.4596440769369046, + "grad_norm": 0.3992721438407898, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0223, + "step": 24360 + }, + { + "epoch": 1.4602432740128228, + "grad_norm": 0.3949171304702759, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.014, + "step": 24370 + }, + { + "epoch": 1.460842471088741, + "grad_norm": 0.33738628029823303, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0157, + "step": 24380 + }, + { + "epoch": 1.4614416681646594, + "grad_norm": 0.42644673585891724, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0153, + "step": 24390 + }, + { + "epoch": 1.4620408652405776, + "grad_norm": 0.25812193751335144, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0173, + "step": 24400 + }, + { + "epoch": 1.462640062316496, + "grad_norm": 0.29154765605926514, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0162, + "step": 24410 + }, + { + "epoch": 1.4632392593924142, + "grad_norm": 0.3526030480861664, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0182, + "step": 24420 + }, + { + "epoch": 1.4638384564683324, + "grad_norm": 0.731890857219696, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0166, + "step": 24430 + }, + { + "epoch": 1.4644376535442507, + "grad_norm": 0.34727898240089417, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0146, + "step": 24440 + }, + { + "epoch": 1.465036850620169, + "grad_norm": 0.4517475962638855, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0229, + "step": 24450 + }, + { + "epoch": 1.4656360476960872, + "grad_norm": 0.3026634156703949, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0174, + "step": 24460 + }, + { + "epoch": 1.4662352447720055, + "grad_norm": 0.20546412467956543, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0177, + "step": 24470 + }, + { + "epoch": 1.4668344418479238, + "grad_norm": 0.47296327352523804, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0176, + "step": 24480 + }, + { + "epoch": 1.467433638923842, + "grad_norm": 0.4550913870334625, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0167, + "step": 24490 + }, + { + "epoch": 1.4680328359997603, + "grad_norm": 0.38641592860221863, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0144, + "step": 24500 + }, + { + "epoch": 1.4686320330756786, + "grad_norm": 0.23746857047080994, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0167, + "step": 24510 + }, + { + "epoch": 1.4692312301515968, + "grad_norm": 0.2114812433719635, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0167, + "step": 24520 + }, + { + "epoch": 1.4698304272275151, + "grad_norm": 0.41703343391418457, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.014, + "step": 24530 + }, + { + "epoch": 1.4704296243034334, + "grad_norm": 0.3279412090778351, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0158, + "step": 24540 + }, + { + "epoch": 1.4710288213793516, + "grad_norm": 0.41653862595558167, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0191, + "step": 24550 + }, + { + "epoch": 1.47162801845527, + "grad_norm": 0.5392111539840698, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0151, + "step": 24560 + }, + { + "epoch": 1.4722272155311882, + "grad_norm": 0.4654570519924164, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0179, + "step": 24570 + }, + { + "epoch": 1.4728264126071064, + "grad_norm": 0.5389031171798706, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0139, + "step": 24580 + }, + { + "epoch": 1.4734256096830247, + "grad_norm": 0.38597020506858826, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0142, + "step": 24590 + }, + { + "epoch": 1.474024806758943, + "grad_norm": 0.4820668399333954, + "learning_rate": 3.497061149826966e-06, + "loss": 0.015, + "step": 24600 + }, + { + "epoch": 1.4746240038348613, + "grad_norm": 0.36856982111930847, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0141, + "step": 24610 + }, + { + "epoch": 1.4752232009107795, + "grad_norm": 0.39727091789245605, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0173, + "step": 24620 + }, + { + "epoch": 1.4758223979866978, + "grad_norm": 0.29800575971603394, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.017, + "step": 24630 + }, + { + "epoch": 1.476421595062616, + "grad_norm": 0.6900123357772827, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0172, + "step": 24640 + }, + { + "epoch": 1.4770207921385343, + "grad_norm": 0.2665303647518158, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0157, + "step": 24650 + }, + { + "epoch": 1.4776199892144526, + "grad_norm": 0.3223106265068054, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.016, + "step": 24660 + }, + { + "epoch": 1.4782191862903709, + "grad_norm": 0.3684261739253998, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.02, + "step": 24670 + }, + { + "epoch": 1.4788183833662891, + "grad_norm": 0.38197198510169983, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0169, + "step": 24680 + }, + { + "epoch": 1.4794175804422074, + "grad_norm": 0.35841095447540283, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0175, + "step": 24690 + }, + { + "epoch": 1.4800167775181257, + "grad_norm": 0.4376572370529175, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0156, + "step": 24700 + }, + { + "epoch": 1.480615974594044, + "grad_norm": 0.5526829361915588, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0147, + "step": 24710 + }, + { + "epoch": 1.4812151716699622, + "grad_norm": 0.2922399938106537, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0152, + "step": 24720 + }, + { + "epoch": 1.4818143687458805, + "grad_norm": 0.4333120882511139, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0147, + "step": 24730 + }, + { + "epoch": 1.4824135658217987, + "grad_norm": 0.26118189096450806, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0166, + "step": 24740 + }, + { + "epoch": 1.483012762897717, + "grad_norm": 0.35313257575035095, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.016, + "step": 24750 + }, + { + "epoch": 1.4836119599736353, + "grad_norm": 0.29923367500305176, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0201, + "step": 24760 + }, + { + "epoch": 1.4842111570495535, + "grad_norm": 0.434772253036499, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0193, + "step": 24770 + }, + { + "epoch": 1.4848103541254718, + "grad_norm": 0.3422386646270752, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0148, + "step": 24780 + }, + { + "epoch": 1.48540955120139, + "grad_norm": 0.4303880035877228, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0197, + "step": 24790 + }, + { + "epoch": 1.4860087482773083, + "grad_norm": 0.4511156976222992, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0172, + "step": 24800 + }, + { + "epoch": 1.4866079453532266, + "grad_norm": 0.22014041244983673, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0146, + "step": 24810 + }, + { + "epoch": 1.4872071424291449, + "grad_norm": 0.4387083351612091, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0173, + "step": 24820 + }, + { + "epoch": 1.4878063395050631, + "grad_norm": 0.44642165303230286, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0175, + "step": 24830 + }, + { + "epoch": 1.4884055365809814, + "grad_norm": 0.39087313413619995, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0158, + "step": 24840 + }, + { + "epoch": 1.4890047336568997, + "grad_norm": 0.42447686195373535, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0133, + "step": 24850 + }, + { + "epoch": 1.4896039307328182, + "grad_norm": 0.43447887897491455, + "learning_rate": 3.36005636574796e-06, + "loss": 0.017, + "step": 24860 + }, + { + "epoch": 1.4902031278087362, + "grad_norm": 0.3336028754711151, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0159, + "step": 24870 + }, + { + "epoch": 1.4908023248846547, + "grad_norm": 0.3250858187675476, + "learning_rate": 3.349767211300933e-06, + "loss": 0.0169, + "step": 24880 + }, + { + "epoch": 1.4914015219605727, + "grad_norm": 0.2616746425628662, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0138, + "step": 24890 + }, + { + "epoch": 1.4920007190364912, + "grad_norm": 0.2752698063850403, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0144, + "step": 24900 + }, + { + "epoch": 1.4925999161124093, + "grad_norm": 0.28214627504348755, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0157, + "step": 24910 + }, + { + "epoch": 1.4931991131883278, + "grad_norm": 0.3839667737483978, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0153, + "step": 24920 + }, + { + "epoch": 1.4937983102642458, + "grad_norm": 0.29319512844085693, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0165, + "step": 24930 + }, + { + "epoch": 1.4943975073401643, + "grad_norm": 0.4219116270542145, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0173, + "step": 24940 + }, + { + "epoch": 1.4949967044160823, + "grad_norm": 0.4940520226955414, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0163, + "step": 24950 + }, + { + "epoch": 1.4955959014920008, + "grad_norm": 0.40064749121665955, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0146, + "step": 24960 + }, + { + "epoch": 1.4961950985679189, + "grad_norm": 0.33400869369506836, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0165, + "step": 24970 + }, + { + "epoch": 1.4967942956438374, + "grad_norm": 0.2474612295627594, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0154, + "step": 24980 + }, + { + "epoch": 1.4973934927197554, + "grad_norm": 0.32819071412086487, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0189, + "step": 24990 + }, + { + "epoch": 1.497992689795674, + "grad_norm": 0.32721251249313354, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0144, + "step": 25000 + }, + { + "epoch": 1.498591886871592, + "grad_norm": 0.4054602086544037, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.017, + "step": 25010 + }, + { + "epoch": 1.4991910839475104, + "grad_norm": 0.4691202938556671, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0188, + "step": 25020 + }, + { + "epoch": 1.4997902810234285, + "grad_norm": 0.9318768382072449, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0192, + "step": 25030 + }, + { + "epoch": 1.500389478099347, + "grad_norm": 0.25441330671310425, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0143, + "step": 25040 + }, + { + "epoch": 1.500988675175265, + "grad_norm": 0.3425164520740509, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0152, + "step": 25050 + }, + { + "epoch": 1.5015878722511835, + "grad_norm": 0.3809274733066559, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0154, + "step": 25060 + }, + { + "epoch": 1.5021870693271016, + "grad_norm": 0.2595506012439728, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0144, + "step": 25070 + }, + { + "epoch": 1.50278626640302, + "grad_norm": 0.29121503233909607, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0131, + "step": 25080 + }, + { + "epoch": 1.503385463478938, + "grad_norm": 0.2435981184244156, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0169, + "step": 25090 + }, + { + "epoch": 1.5039846605548566, + "grad_norm": 0.2967667579650879, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0146, + "step": 25100 + }, + { + "epoch": 1.5045838576307746, + "grad_norm": 0.2658415138721466, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0164, + "step": 25110 + }, + { + "epoch": 1.5051830547066931, + "grad_norm": 0.25294387340545654, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0137, + "step": 25120 + }, + { + "epoch": 1.5057822517826112, + "grad_norm": 0.4117964208126068, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0175, + "step": 25130 + }, + { + "epoch": 1.5063814488585296, + "grad_norm": 0.22604988515377045, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0187, + "step": 25140 + }, + { + "epoch": 1.5069806459344477, + "grad_norm": 0.2773517668247223, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0176, + "step": 25150 + }, + { + "epoch": 1.5075798430103662, + "grad_norm": 0.3213720917701721, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0181, + "step": 25160 + }, + { + "epoch": 1.5081790400862842, + "grad_norm": 0.3932463526725769, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0169, + "step": 25170 + }, + { + "epoch": 1.5087782371622027, + "grad_norm": 0.27642500400543213, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0177, + "step": 25180 + }, + { + "epoch": 1.5093774342381208, + "grad_norm": 0.4212909936904907, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0165, + "step": 25190 + }, + { + "epoch": 1.5099766313140393, + "grad_norm": 0.31928038597106934, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0159, + "step": 25200 + }, + { + "epoch": 1.5105758283899573, + "grad_norm": 0.31685909628868103, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0146, + "step": 25210 + }, + { + "epoch": 1.5111750254658758, + "grad_norm": 0.22591470181941986, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0142, + "step": 25220 + }, + { + "epoch": 1.5117742225417938, + "grad_norm": 0.22344504296779633, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0127, + "step": 25230 + }, + { + "epoch": 1.5123734196177123, + "grad_norm": 0.4538969099521637, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.0174, + "step": 25240 + }, + { + "epoch": 1.5129726166936306, + "grad_norm": 0.35422542691230774, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0169, + "step": 25250 + }, + { + "epoch": 1.5135718137695489, + "grad_norm": 0.41911551356315613, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0157, + "step": 25260 + }, + { + "epoch": 1.5141710108454671, + "grad_norm": 0.4679270088672638, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0147, + "step": 25270 + }, + { + "epoch": 1.5147702079213854, + "grad_norm": 0.29286396503448486, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0168, + "step": 25280 + }, + { + "epoch": 1.5153694049973037, + "grad_norm": 0.2840272784233093, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0182, + "step": 25290 + }, + { + "epoch": 1.515968602073222, + "grad_norm": 0.3369516432285309, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0159, + "step": 25300 + }, + { + "epoch": 1.5165677991491402, + "grad_norm": 0.36810392141342163, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.0207, + "step": 25310 + }, + { + "epoch": 1.5171669962250585, + "grad_norm": 0.30844470858573914, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.0151, + "step": 25320 + }, + { + "epoch": 1.5177661933009767, + "grad_norm": 0.22359415888786316, + "learning_rate": 3.127844986891409e-06, + "loss": 0.018, + "step": 25330 + }, + { + "epoch": 1.518365390376895, + "grad_norm": 0.42099806666374207, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0158, + "step": 25340 + }, + { + "epoch": 1.5189645874528133, + "grad_norm": 0.2903825342655182, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0157, + "step": 25350 + }, + { + "epoch": 1.5195637845287315, + "grad_norm": 0.33182457089424133, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0179, + "step": 25360 + }, + { + "epoch": 1.5201629816046498, + "grad_norm": 0.4607376158237457, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0189, + "step": 25370 + }, + { + "epoch": 1.520762178680568, + "grad_norm": 0.21630525588989258, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0156, + "step": 25380 + }, + { + "epoch": 1.5213613757564863, + "grad_norm": 0.38443559408187866, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0174, + "step": 25390 + }, + { + "epoch": 1.5219605728324046, + "grad_norm": 0.19618573784828186, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0126, + "step": 25400 + }, + { + "epoch": 1.5225597699083229, + "grad_norm": 0.4141467809677124, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0147, + "step": 25410 + }, + { + "epoch": 1.5231589669842411, + "grad_norm": 0.39915844798088074, + "learning_rate": 3.085688933413021e-06, + "loss": 0.0156, + "step": 25420 + }, + { + "epoch": 1.5237581640601594, + "grad_norm": 0.25136515498161316, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0147, + "step": 25430 + }, + { + "epoch": 1.5243573611360777, + "grad_norm": 0.30357712507247925, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0153, + "step": 25440 + }, + { + "epoch": 1.524956558211996, + "grad_norm": 0.37422874569892883, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0146, + "step": 25450 + }, + { + "epoch": 1.5255557552879142, + "grad_norm": 0.19593080878257751, + "learning_rate": 3.067194157156521e-06, + "loss": 0.0185, + "step": 25460 + }, + { + "epoch": 1.5261549523638325, + "grad_norm": 0.4984768033027649, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0159, + "step": 25470 + }, + { + "epoch": 1.5267541494397507, + "grad_norm": 0.35011765360832214, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0171, + "step": 25480 + }, + { + "epoch": 1.527353346515669, + "grad_norm": 0.43658894300460815, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.014, + "step": 25490 + }, + { + "epoch": 1.5279525435915873, + "grad_norm": 0.3372974693775177, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0205, + "step": 25500 + }, + { + "epoch": 1.5285517406675055, + "grad_norm": 0.2942260205745697, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0166, + "step": 25510 + }, + { + "epoch": 1.5291509377434238, + "grad_norm": 0.43129920959472656, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0167, + "step": 25520 + }, + { + "epoch": 1.529750134819342, + "grad_norm": 0.3023529648780823, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0167, + "step": 25530 + }, + { + "epoch": 1.5303493318952603, + "grad_norm": 0.298043429851532, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0185, + "step": 25540 + }, + { + "epoch": 1.5309485289711786, + "grad_norm": 0.2765754461288452, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0134, + "step": 25550 + }, + { + "epoch": 1.5315477260470969, + "grad_norm": 0.43460533022880554, + "learning_rate": 3.021609639602321e-06, + "loss": 0.014, + "step": 25560 + }, + { + "epoch": 1.5321469231230151, + "grad_norm": 0.2843260169029236, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0155, + "step": 25570 + }, + { + "epoch": 1.5327461201989334, + "grad_norm": 0.3337956964969635, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0164, + "step": 25580 + }, + { + "epoch": 1.5333453172748517, + "grad_norm": 0.4841095805168152, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0147, + "step": 25590 + }, + { + "epoch": 1.53394451435077, + "grad_norm": 0.31032758951187134, + "learning_rate": 3.003637700546652e-06, + "loss": 0.015, + "step": 25600 + }, + { + "epoch": 1.5345437114266882, + "grad_norm": 0.4080669581890106, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0151, + "step": 25610 + }, + { + "epoch": 1.5351429085026065, + "grad_norm": 0.23705625534057617, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0136, + "step": 25620 + }, + { + "epoch": 1.5357421055785248, + "grad_norm": 0.5293036103248596, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0195, + "step": 25630 + }, + { + "epoch": 1.536341302654443, + "grad_norm": 0.19166356325149536, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0118, + "step": 25640 + }, + { + "epoch": 1.5369404997303613, + "grad_norm": 0.35923510789871216, + "learning_rate": 2.981383959667165e-06, + "loss": 0.0153, + "step": 25650 + }, + { + "epoch": 1.5375396968062796, + "grad_norm": 0.525636613368988, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0169, + "step": 25660 + }, + { + "epoch": 1.5381388938821978, + "grad_norm": 0.3833159804344177, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0155, + "step": 25670 + }, + { + "epoch": 1.538738090958116, + "grad_norm": 0.30203381180763245, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0163, + "step": 25680 + }, + { + "epoch": 1.5393372880340344, + "grad_norm": 0.5735456347465515, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0158, + "step": 25690 + }, + { + "epoch": 1.5399364851099526, + "grad_norm": 0.4676662087440491, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0195, + "step": 25700 + }, + { + "epoch": 1.540535682185871, + "grad_norm": 0.29208818078041077, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0165, + "step": 25710 + }, + { + "epoch": 1.5411348792617892, + "grad_norm": 0.3703807294368744, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.015, + "step": 25720 + }, + { + "epoch": 1.5417340763377074, + "grad_norm": 0.5645684003829956, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0192, + "step": 25730 + }, + { + "epoch": 1.5423332734136257, + "grad_norm": 0.5154808759689331, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0154, + "step": 25740 + }, + { + "epoch": 1.542932470489544, + "grad_norm": 0.49836722016334534, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0161, + "step": 25750 + }, + { + "epoch": 1.5435316675654622, + "grad_norm": 0.4711974561214447, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0143, + "step": 25760 + }, + { + "epoch": 1.5441308646413805, + "grad_norm": 0.3468717932701111, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0151, + "step": 25770 + }, + { + "epoch": 1.5447300617172988, + "grad_norm": 0.3216229975223541, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0146, + "step": 25780 + }, + { + "epoch": 1.5453292587932173, + "grad_norm": 0.3436613976955414, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.0172, + "step": 25790 + }, + { + "epoch": 1.5459284558691353, + "grad_norm": 0.3601810336112976, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0166, + "step": 25800 + }, + { + "epoch": 1.5465276529450538, + "grad_norm": 0.2320292890071869, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0156, + "step": 25810 + }, + { + "epoch": 1.5471268500209718, + "grad_norm": 0.4563167989253998, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0168, + "step": 25820 + }, + { + "epoch": 1.5477260470968903, + "grad_norm": 0.33735397458076477, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0165, + "step": 25830 + }, + { + "epoch": 1.5483252441728084, + "grad_norm": 0.41785505414009094, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0179, + "step": 25840 + }, + { + "epoch": 1.5489244412487269, + "grad_norm": 0.41172194480895996, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.016, + "step": 25850 + }, + { + "epoch": 1.549523638324645, + "grad_norm": 0.4549838900566101, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0134, + "step": 25860 + }, + { + "epoch": 1.5501228354005634, + "grad_norm": 0.6315169930458069, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0176, + "step": 25870 + }, + { + "epoch": 1.5507220324764814, + "grad_norm": 0.43143466114997864, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0166, + "step": 25880 + }, + { + "epoch": 1.5513212295524, + "grad_norm": 0.4559693932533264, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0162, + "step": 25890 + }, + { + "epoch": 1.551920426628318, + "grad_norm": 0.3333865702152252, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0168, + "step": 25900 + }, + { + "epoch": 1.5525196237042365, + "grad_norm": 0.3939986526966095, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0146, + "step": 25910 + }, + { + "epoch": 1.5531188207801545, + "grad_norm": 0.35824787616729736, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0179, + "step": 25920 + }, + { + "epoch": 1.553718017856073, + "grad_norm": 0.40517401695251465, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0158, + "step": 25930 + }, + { + "epoch": 1.554317214931991, + "grad_norm": 0.41149890422821045, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0142, + "step": 25940 + }, + { + "epoch": 1.5549164120079095, + "grad_norm": 0.22149957716464996, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0153, + "step": 25950 + }, + { + "epoch": 1.5555156090838276, + "grad_norm": 0.2622004747390747, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0142, + "step": 25960 + }, + { + "epoch": 1.556114806159746, + "grad_norm": 0.3235580623149872, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.014, + "step": 25970 + }, + { + "epoch": 1.5567140032356641, + "grad_norm": 0.4349730312824249, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0148, + "step": 25980 + }, + { + "epoch": 1.5573132003115826, + "grad_norm": 0.30583831667900085, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0148, + "step": 25990 + }, + { + "epoch": 1.5579123973875006, + "grad_norm": 0.3436671495437622, + "learning_rate": 2.832230653119002e-06, + "loss": 0.015, + "step": 26000 + }, + { + "epoch": 1.5585115944634191, + "grad_norm": 0.23681265115737915, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0169, + "step": 26010 + }, + { + "epoch": 1.5591107915393372, + "grad_norm": 0.2916300892829895, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.0145, + "step": 26020 + }, + { + "epoch": 1.5597099886152557, + "grad_norm": 0.4516601264476776, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.0168, + "step": 26030 + }, + { + "epoch": 1.5603091856911737, + "grad_norm": 0.25640442967414856, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0145, + "step": 26040 + }, + { + "epoch": 1.5609083827670922, + "grad_norm": 0.3058616816997528, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.0134, + "step": 26050 + }, + { + "epoch": 1.5615075798430103, + "grad_norm": 0.37286022305488586, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0133, + "step": 26060 + }, + { + "epoch": 1.5621067769189287, + "grad_norm": 0.2570302486419678, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.0136, + "step": 26070 + }, + { + "epoch": 1.5627059739948468, + "grad_norm": 0.5596319437026978, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0184, + "step": 26080 + }, + { + "epoch": 1.5633051710707653, + "grad_norm": 0.36270666122436523, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.0158, + "step": 26090 + }, + { + "epoch": 1.5639043681466833, + "grad_norm": 0.4473365247249603, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0146, + "step": 26100 + }, + { + "epoch": 1.5645035652226018, + "grad_norm": 0.256773978471756, + "learning_rate": 2.78776903555923e-06, + "loss": 0.0141, + "step": 26110 + }, + { + "epoch": 1.5651027622985199, + "grad_norm": 0.3173777759075165, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.0155, + "step": 26120 + }, + { + "epoch": 1.5657019593744383, + "grad_norm": 0.39649754762649536, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0171, + "step": 26130 + }, + { + "epoch": 1.5663011564503564, + "grad_norm": 0.8298602104187012, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.0181, + "step": 26140 + }, + { + "epoch": 1.5669003535262749, + "grad_norm": 0.41698411107063293, + "learning_rate": 2.771889969647e-06, + "loss": 0.0155, + "step": 26150 + }, + { + "epoch": 1.567499550602193, + "grad_norm": 0.3315671384334564, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0142, + "step": 26160 + }, + { + "epoch": 1.5680987476781114, + "grad_norm": 0.27380600571632385, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0146, + "step": 26170 + }, + { + "epoch": 1.5686979447540295, + "grad_norm": 0.2785346210002899, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0174, + "step": 26180 + }, + { + "epoch": 1.569297141829948, + "grad_norm": 0.46294671297073364, + "learning_rate": 2.756165401834933e-06, + "loss": 0.0177, + "step": 26190 + }, + { + "epoch": 1.569896338905866, + "grad_norm": 0.3026588559150696, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.016, + "step": 26200 + }, + { + "epoch": 1.5704955359817845, + "grad_norm": 0.335443377494812, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0138, + "step": 26210 + }, + { + "epoch": 1.5710947330577025, + "grad_norm": 0.26176130771636963, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0182, + "step": 26220 + }, + { + "epoch": 1.571693930133621, + "grad_norm": 0.41030630469322205, + "learning_rate": 2.740595627381096e-06, + "loss": 0.0157, + "step": 26230 + }, + { + "epoch": 1.572293127209539, + "grad_norm": 0.25381243228912354, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0126, + "step": 26240 + }, + { + "epoch": 1.5728923242854576, + "grad_norm": 0.3790159821510315, + "learning_rate": 2.732868879137055e-06, + "loss": 0.0138, + "step": 26250 + }, + { + "epoch": 1.5734915213613756, + "grad_norm": 0.3830420672893524, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.0134, + "step": 26260 + }, + { + "epoch": 1.574090718437294, + "grad_norm": 0.534146785736084, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0151, + "step": 26270 + }, + { + "epoch": 1.5746899155132121, + "grad_norm": 0.5088993310928345, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0188, + "step": 26280 + }, + { + "epoch": 1.5752891125891306, + "grad_norm": 0.271245539188385, + "learning_rate": 2.717531841969889e-06, + "loss": 0.015, + "step": 26290 + }, + { + "epoch": 1.5758883096650487, + "grad_norm": 0.7041701078414917, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0205, + "step": 26300 + }, + { + "epoch": 1.5764875067409672, + "grad_norm": 1.5670353174209595, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0246, + "step": 26310 + }, + { + "epoch": 1.5770867038168854, + "grad_norm": 0.3782089054584503, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.0145, + "step": 26320 + }, + { + "epoch": 1.5776859008928037, + "grad_norm": 0.2301669716835022, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0132, + "step": 26330 + }, + { + "epoch": 1.578285097968722, + "grad_norm": 0.4629409611225128, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.013, + "step": 26340 + }, + { + "epoch": 1.5788842950446402, + "grad_norm": 0.2709483802318573, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0154, + "step": 26350 + }, + { + "epoch": 1.5794834921205585, + "grad_norm": 0.31532853841781616, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.016, + "step": 26360 + }, + { + "epoch": 1.5800826891964768, + "grad_norm": 0.350920170545578, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.016, + "step": 26370 + }, + { + "epoch": 1.580681886272395, + "grad_norm": 0.5954864025115967, + "learning_rate": 2.683592557860616e-06, + "loss": 0.0178, + "step": 26380 + }, + { + "epoch": 1.5812810833483133, + "grad_norm": 0.4362819492816925, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.017, + "step": 26390 + }, + { + "epoch": 1.5818802804242316, + "grad_norm": 0.2640637755393982, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.0146, + "step": 26400 + }, + { + "epoch": 1.5824794775001498, + "grad_norm": 0.475008100271225, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0134, + "step": 26410 + }, + { + "epoch": 1.583078674576068, + "grad_norm": 0.27583909034729004, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.014, + "step": 26420 + }, + { + "epoch": 1.5836778716519864, + "grad_norm": 0.392715722322464, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0183, + "step": 26430 + }, + { + "epoch": 1.5842770687279046, + "grad_norm": 0.19658122956752777, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0172, + "step": 26440 + }, + { + "epoch": 1.584876265803823, + "grad_norm": 0.8701423406600952, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.015, + "step": 26450 + }, + { + "epoch": 1.5854754628797412, + "grad_norm": 0.9331104159355164, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0152, + "step": 26460 + }, + { + "epoch": 1.5860746599556594, + "grad_norm": 0.29767271876335144, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.0143, + "step": 26470 + }, + { + "epoch": 1.5866738570315777, + "grad_norm": 0.3449382781982422, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.0151, + "step": 26480 + }, + { + "epoch": 1.587273054107496, + "grad_norm": 0.26225733757019043, + "learning_rate": 2.643185095075473e-06, + "loss": 0.0143, + "step": 26490 + }, + { + "epoch": 1.5878722511834142, + "grad_norm": 0.3581456243991852, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0138, + "step": 26500 + }, + { + "epoch": 1.5884714482593325, + "grad_norm": 0.246829554438591, + "learning_rate": 2.635965610168249e-06, + "loss": 0.0178, + "step": 26510 + }, + { + "epoch": 1.5890706453352508, + "grad_norm": 0.317020446062088, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0157, + "step": 26520 + }, + { + "epoch": 1.589669842411169, + "grad_norm": 0.3022174537181854, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0132, + "step": 26530 + }, + { + "epoch": 1.5902690394870873, + "grad_norm": 0.26253461837768555, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0135, + "step": 26540 + }, + { + "epoch": 1.5908682365630056, + "grad_norm": 0.2757222056388855, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0138, + "step": 26550 + }, + { + "epoch": 1.5914674336389238, + "grad_norm": 0.3857184052467346, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.013, + "step": 26560 + }, + { + "epoch": 1.5920666307148421, + "grad_norm": 0.4407658576965332, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0185, + "step": 26570 + }, + { + "epoch": 1.5926658277907604, + "grad_norm": 0.3413793444633484, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0135, + "step": 26580 + }, + { + "epoch": 1.5932650248666786, + "grad_norm": 0.24001765251159668, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0146, + "step": 26590 + }, + { + "epoch": 1.593864221942597, + "grad_norm": 0.4623468518257141, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.015, + "step": 26600 + }, + { + "epoch": 1.5944634190185152, + "grad_norm": 0.32984790205955505, + "learning_rate": 2.600457796416397e-06, + "loss": 0.0159, + "step": 26610 + }, + { + "epoch": 1.5950626160944334, + "grad_norm": 0.31533241271972656, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.0157, + "step": 26620 + }, + { + "epoch": 1.5956618131703517, + "grad_norm": 0.3851890563964844, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0168, + "step": 26630 + }, + { + "epoch": 1.59626101024627, + "grad_norm": 0.41252562403678894, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0165, + "step": 26640 + }, + { + "epoch": 1.5968602073221883, + "grad_norm": 0.473445326089859, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0123, + "step": 26650 + }, + { + "epoch": 1.5974594043981065, + "grad_norm": 0.3054860532283783, + "learning_rate": 2.583073279935805e-06, + "loss": 0.014, + "step": 26660 + }, + { + "epoch": 1.5980586014740248, + "grad_norm": 0.28879237174987793, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.0171, + "step": 26670 + }, + { + "epoch": 1.598657798549943, + "grad_norm": 0.32456526160240173, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0143, + "step": 26680 + }, + { + "epoch": 1.5992569956258613, + "grad_norm": 0.5708281993865967, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0163, + "step": 26690 + }, + { + "epoch": 1.5998561927017796, + "grad_norm": 0.6487006545066833, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.0169, + "step": 26700 + }, + { + "epoch": 1.6004553897776979, + "grad_norm": 0.3364347517490387, + "learning_rate": 2.565935706183804e-06, + "loss": 0.018, + "step": 26710 + }, + { + "epoch": 1.6010545868536161, + "grad_norm": 0.41275516152381897, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0153, + "step": 26720 + }, + { + "epoch": 1.6016537839295344, + "grad_norm": 0.391722708940506, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0161, + "step": 26730 + }, + { + "epoch": 1.6022529810054527, + "grad_norm": 0.3787323534488678, + "learning_rate": 2.555771903907403e-06, + "loss": 0.0174, + "step": 26740 + }, + { + "epoch": 1.602852178081371, + "grad_norm": 0.3075166940689087, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0145, + "step": 26750 + }, + { + "epoch": 1.6034513751572892, + "grad_norm": 0.3613744080066681, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0154, + "step": 26760 + }, + { + "epoch": 1.6040505722332075, + "grad_norm": 0.34713929891586304, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0114, + "step": 26770 + }, + { + "epoch": 1.6046497693091257, + "grad_norm": 0.4100549519062042, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.013, + "step": 26780 + }, + { + "epoch": 1.605248966385044, + "grad_norm": 0.3897320330142975, + "learning_rate": 2.5390304813179e-06, + "loss": 0.016, + "step": 26790 + }, + { + "epoch": 1.6058481634609623, + "grad_norm": 0.3584144413471222, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.015, + "step": 26800 + }, + { + "epoch": 1.6064473605368805, + "grad_norm": 0.31220853328704834, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0131, + "step": 26810 + }, + { + "epoch": 1.6070465576127988, + "grad_norm": 0.3192695379257202, + "learning_rate": 2.529104749380281e-06, + "loss": 0.0133, + "step": 26820 + }, + { + "epoch": 1.607645754688717, + "grad_norm": 0.30283334851264954, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0122, + "step": 26830 + }, + { + "epoch": 1.6082449517646353, + "grad_norm": 0.282143771648407, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0138, + "step": 26840 + }, + { + "epoch": 1.6088441488405536, + "grad_norm": 0.43043816089630127, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0155, + "step": 26850 + }, + { + "epoch": 1.609443345916472, + "grad_norm": 0.2672103941440582, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0153, + "step": 26860 + }, + { + "epoch": 1.6100425429923901, + "grad_norm": 0.39164942502975464, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0145, + "step": 26870 + }, + { + "epoch": 1.6106417400683086, + "grad_norm": 0.33121028542518616, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.014, + "step": 26880 + }, + { + "epoch": 1.6112409371442267, + "grad_norm": 0.46786385774612427, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0144, + "step": 26890 + }, + { + "epoch": 1.6118401342201452, + "grad_norm": 0.4348220229148865, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0136, + "step": 26900 + }, + { + "epoch": 1.6124393312960632, + "grad_norm": 0.7225855588912964, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0159, + "step": 26910 + }, + { + "epoch": 1.6130385283719817, + "grad_norm": 0.540884256362915, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0153, + "step": 26920 + }, + { + "epoch": 1.6136377254478997, + "grad_norm": 0.2984727919101715, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0127, + "step": 26930 + }, + { + "epoch": 1.6142369225238182, + "grad_norm": 0.34762996435165405, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0181, + "step": 26940 + }, + { + "epoch": 1.6148361195997363, + "grad_norm": 0.4229494035243988, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0184, + "step": 26950 + }, + { + "epoch": 1.6154353166756548, + "grad_norm": 0.4511129558086395, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0136, + "step": 26960 + }, + { + "epoch": 1.6160345137515728, + "grad_norm": 0.20887398719787598, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0136, + "step": 26970 + }, + { + "epoch": 1.6166337108274913, + "grad_norm": 0.27858126163482666, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0128, + "step": 26980 + }, + { + "epoch": 1.6172329079034093, + "grad_norm": 0.32049617171287537, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.013, + "step": 26990 + }, + { + "epoch": 1.6178321049793278, + "grad_norm": 0.4276943802833557, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0152, + "step": 27000 + }, + { + "epoch": 1.6184313020552459, + "grad_norm": 0.29610252380371094, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.0122, + "step": 27010 + }, + { + "epoch": 1.6190304991311644, + "grad_norm": 0.24043124914169312, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0116, + "step": 27020 + }, + { + "epoch": 1.6196296962070824, + "grad_norm": 0.33894526958465576, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0119, + "step": 27030 + }, + { + "epoch": 1.620228893283001, + "grad_norm": 0.2597903609275818, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.0144, + "step": 27040 + }, + { + "epoch": 1.620828090358919, + "grad_norm": 0.4067903459072113, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.0137, + "step": 27050 + }, + { + "epoch": 1.6214272874348374, + "grad_norm": 0.48484402894973755, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.0147, + "step": 27060 + }, + { + "epoch": 1.6220264845107555, + "grad_norm": 0.52725750207901, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.0175, + "step": 27070 + }, + { + "epoch": 1.622625681586674, + "grad_norm": 0.23465880751609802, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0123, + "step": 27080 + }, + { + "epoch": 1.623224878662592, + "grad_norm": 0.4273434579372406, + "learning_rate": 2.443811559007335e-06, + "loss": 0.015, + "step": 27090 + }, + { + "epoch": 1.6238240757385105, + "grad_norm": 0.2985517680644989, + "learning_rate": 2.440792688039862e-06, + "loss": 0.013, + "step": 27100 + }, + { + "epoch": 1.6244232728144286, + "grad_norm": 0.4334832727909088, + "learning_rate": 2.437783861778914e-06, + "loss": 0.0113, + "step": 27110 + }, + { + "epoch": 1.625022469890347, + "grad_norm": 0.2899027466773987, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.0153, + "step": 27120 + }, + { + "epoch": 1.625621666966265, + "grad_norm": 0.35197123885154724, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0143, + "step": 27130 + }, + { + "epoch": 1.6262208640421836, + "grad_norm": 0.25402888655662537, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0152, + "step": 27140 + }, + { + "epoch": 1.6268200611181016, + "grad_norm": 0.49205178022384644, + "learning_rate": 2.425849074243997e-06, + "loss": 0.014, + "step": 27150 + }, + { + "epoch": 1.6274192581940201, + "grad_norm": 0.2541142404079437, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0134, + "step": 27160 + }, + { + "epoch": 1.6280184552699382, + "grad_norm": 0.4348624646663666, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0126, + "step": 27170 + }, + { + "epoch": 1.6286176523458566, + "grad_norm": 0.33341577649116516, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0149, + "step": 27180 + }, + { + "epoch": 1.6292168494217747, + "grad_norm": 0.394909143447876, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0146, + "step": 27190 + }, + { + "epoch": 1.6298160464976932, + "grad_norm": 0.47289931774139404, + "learning_rate": 2.411157015949963e-06, + "loss": 0.0165, + "step": 27200 + }, + { + "epoch": 1.6304152435736112, + "grad_norm": 0.45220911502838135, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0116, + "step": 27210 + }, + { + "epoch": 1.6310144406495297, + "grad_norm": 0.36566999554634094, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0158, + "step": 27220 + }, + { + "epoch": 1.6316136377254478, + "grad_norm": 0.26231661438941956, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0131, + "step": 27230 + }, + { + "epoch": 1.6322128348013663, + "grad_norm": 0.32366135716438293, + "learning_rate": 2.399584779188417e-06, + "loss": 0.0131, + "step": 27240 + }, + { + "epoch": 1.6328120318772843, + "grad_norm": 0.3068046271800995, + "learning_rate": 2.396716944205467e-06, + "loss": 0.0123, + "step": 27250 + }, + { + "epoch": 1.6334112289532028, + "grad_norm": 0.28027409315109253, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.013, + "step": 27260 + }, + { + "epoch": 1.6340104260291208, + "grad_norm": 0.3580668270587921, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0133, + "step": 27270 + }, + { + "epoch": 1.6346096231050393, + "grad_norm": 0.42907601594924927, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0148, + "step": 27280 + }, + { + "epoch": 1.6352088201809574, + "grad_norm": 0.2437274307012558, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.014, + "step": 27290 + }, + { + "epoch": 1.6358080172568759, + "grad_norm": 0.3689195513725281, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0113, + "step": 27300 + }, + { + "epoch": 1.636407214332794, + "grad_norm": 0.48261409997940063, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0141, + "step": 27310 + }, + { + "epoch": 1.6370064114087124, + "grad_norm": 0.3526110351085663, + "learning_rate": 2.376924986395271e-06, + "loss": 0.018, + "step": 27320 + }, + { + "epoch": 1.6376056084846304, + "grad_norm": 0.23795528709888458, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0154, + "step": 27330 + }, + { + "epoch": 1.638204805560549, + "grad_norm": 0.40328165888786316, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0128, + "step": 27340 + }, + { + "epoch": 1.638804002636467, + "grad_norm": 0.4420272409915924, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0137, + "step": 27350 + }, + { + "epoch": 1.6394031997123855, + "grad_norm": 0.23652666807174683, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.014, + "step": 27360 + }, + { + "epoch": 1.6400023967883035, + "grad_norm": 0.3468151390552521, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0155, + "step": 27370 + }, + { + "epoch": 1.640601593864222, + "grad_norm": 0.35930299758911133, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.014, + "step": 27380 + }, + { + "epoch": 1.6412007909401403, + "grad_norm": 0.19394037127494812, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0148, + "step": 27390 + }, + { + "epoch": 1.6417999880160585, + "grad_norm": 0.35877296328544617, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.0136, + "step": 27400 + }, + { + "epoch": 1.6423991850919768, + "grad_norm": 0.29156941175460815, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.0128, + "step": 27410 + }, + { + "epoch": 1.642998382167895, + "grad_norm": 0.3780912756919861, + "learning_rate": 2.349511203900333e-06, + "loss": 0.015, + "step": 27420 + }, + { + "epoch": 1.6435975792438133, + "grad_norm": 0.3290363848209381, + "learning_rate": 2.3468256081258e-06, + "loss": 0.0152, + "step": 27430 + }, + { + "epoch": 1.6441967763197316, + "grad_norm": 0.5973288416862488, + "learning_rate": 2.344150167333397e-06, + "loss": 0.015, + "step": 27440 + }, + { + "epoch": 1.6447959733956499, + "grad_norm": 0.4506072402000427, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0146, + "step": 27450 + }, + { + "epoch": 1.6453951704715681, + "grad_norm": 0.32139888405799866, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0168, + "step": 27460 + }, + { + "epoch": 1.6459943675474864, + "grad_norm": 0.3994857370853424, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0152, + "step": 27470 + }, + { + "epoch": 1.6465935646234047, + "grad_norm": 0.26820749044418335, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0134, + "step": 27480 + }, + { + "epoch": 1.647192761699323, + "grad_norm": 0.3729577958583832, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0162, + "step": 27490 + }, + { + "epoch": 1.6477919587752412, + "grad_norm": 0.24220766127109528, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.0138, + "step": 27500 + }, + { + "epoch": 1.6483911558511595, + "grad_norm": 0.49408698081970215, + "learning_rate": 2.325706683525094e-06, + "loss": 0.017, + "step": 27510 + }, + { + "epoch": 1.6489903529270777, + "grad_norm": 0.22594054043293, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0148, + "step": 27520 + }, + { + "epoch": 1.649589550002996, + "grad_norm": 0.41143184900283813, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0152, + "step": 27530 + }, + { + "epoch": 1.6501887470789143, + "grad_norm": 0.3367273509502411, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0138, + "step": 27540 + }, + { + "epoch": 1.6507879441548325, + "grad_norm": 0.6019514203071594, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0168, + "step": 27550 + }, + { + "epoch": 1.6513871412307508, + "grad_norm": 0.5941750407218933, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.013, + "step": 27560 + }, + { + "epoch": 1.651986338306669, + "grad_norm": 0.43502920866012573, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0134, + "step": 27570 + }, + { + "epoch": 1.6525855353825873, + "grad_norm": 0.32287806272506714, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0131, + "step": 27580 + }, + { + "epoch": 1.6531847324585056, + "grad_norm": 0.4743358790874481, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0198, + "step": 27590 + }, + { + "epoch": 1.6537839295344239, + "grad_norm": 0.29685747623443604, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0147, + "step": 27600 + }, + { + "epoch": 1.6543831266103421, + "grad_norm": 0.4355921447277069, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0136, + "step": 27610 + }, + { + "epoch": 1.6549823236862604, + "grad_norm": 0.4096180498600006, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.013, + "step": 27620 + }, + { + "epoch": 1.6555815207621787, + "grad_norm": 0.3704766631126404, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.0152, + "step": 27630 + }, + { + "epoch": 1.656180717838097, + "grad_norm": 0.4177798628807068, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0147, + "step": 27640 + }, + { + "epoch": 1.6567799149140152, + "grad_norm": 0.32486793398857117, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0125, + "step": 27650 + }, + { + "epoch": 1.6573791119899335, + "grad_norm": 0.3335772752761841, + "learning_rate": 2.287865908463585e-06, + "loss": 0.0155, + "step": 27660 + }, + { + "epoch": 1.6579783090658518, + "grad_norm": 0.4169732332229614, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.0153, + "step": 27670 + }, + { + "epoch": 1.65857750614177, + "grad_norm": 0.2390674203634262, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0161, + "step": 27680 + }, + { + "epoch": 1.6591767032176883, + "grad_norm": 0.41580212116241455, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0116, + "step": 27690 + }, + { + "epoch": 1.6597759002936066, + "grad_norm": 0.3981385827064514, + "learning_rate": 2.278163146933236e-06, + "loss": 0.013, + "step": 27700 + }, + { + "epoch": 1.6603750973695248, + "grad_norm": 0.3737584948539734, + "learning_rate": 2.275763038367336e-06, + "loss": 0.011, + "step": 27710 + }, + { + "epoch": 1.660974294445443, + "grad_norm": 0.2370023876428604, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0173, + "step": 27720 + }, + { + "epoch": 1.6615734915213614, + "grad_norm": 0.6599531769752502, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0145, + "step": 27730 + }, + { + "epoch": 1.6621726885972796, + "grad_norm": 0.3255928158760071, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0149, + "step": 27740 + }, + { + "epoch": 1.662771885673198, + "grad_norm": 0.28063544631004333, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0157, + "step": 27750 + }, + { + "epoch": 1.6633710827491162, + "grad_norm": 0.300642192363739, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0139, + "step": 27760 + }, + { + "epoch": 1.6639702798250344, + "grad_norm": 0.3485228717327118, + "learning_rate": 2.261577490652103e-06, + "loss": 0.0139, + "step": 27770 + }, + { + "epoch": 1.6645694769009527, + "grad_norm": 0.31508076190948486, + "learning_rate": 2.259249109209093e-06, + "loss": 0.0162, + "step": 27780 + }, + { + "epoch": 1.665168673976871, + "grad_norm": 0.4764767587184906, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0145, + "step": 27790 + }, + { + "epoch": 1.6657678710527892, + "grad_norm": 0.26427552103996277, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.013, + "step": 27800 + }, + { + "epoch": 1.6663670681287075, + "grad_norm": 0.5152391791343689, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.015, + "step": 27810 + }, + { + "epoch": 1.6669662652046258, + "grad_norm": 0.4326762855052948, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0111, + "step": 27820 + }, + { + "epoch": 1.667565462280544, + "grad_norm": 0.3035188913345337, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0141, + "step": 27830 + }, + { + "epoch": 1.6681646593564623, + "grad_norm": 0.49474793672561646, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0129, + "step": 27840 + }, + { + "epoch": 1.6687638564323806, + "grad_norm": 0.46236565709114075, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.016, + "step": 27850 + }, + { + "epoch": 1.6693630535082988, + "grad_norm": 0.31711387634277344, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.013, + "step": 27860 + }, + { + "epoch": 1.669962250584217, + "grad_norm": 0.4073173701763153, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.0124, + "step": 27870 + }, + { + "epoch": 1.6705614476601354, + "grad_norm": 0.3320833742618561, + "learning_rate": 2.236529916369313e-06, + "loss": 0.0172, + "step": 27880 + }, + { + "epoch": 1.6711606447360536, + "grad_norm": 0.4608694314956665, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0143, + "step": 27890 + }, + { + "epoch": 1.671759841811972, + "grad_norm": 0.9055055975914001, + "learning_rate": 2.232109406453595e-06, + "loss": 0.017, + "step": 27900 + }, + { + "epoch": 1.6723590388878904, + "grad_norm": 0.19240455329418182, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0128, + "step": 27910 + }, + { + "epoch": 1.6729582359638084, + "grad_norm": 0.2756566107273102, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0157, + "step": 27920 + }, + { + "epoch": 1.673557433039727, + "grad_norm": 0.47067585587501526, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0132, + "step": 27930 + }, + { + "epoch": 1.674156630115645, + "grad_norm": 0.421377032995224, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0137, + "step": 27940 + }, + { + "epoch": 1.6747558271915635, + "grad_norm": 0.437125563621521, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0119, + "step": 27950 + }, + { + "epoch": 1.6753550242674815, + "grad_norm": 0.3617478311061859, + "learning_rate": 2.219094909262834e-06, + "loss": 0.0159, + "step": 27960 + }, + { + "epoch": 1.6759542213434, + "grad_norm": 0.39676180481910706, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0099, + "step": 27970 + }, + { + "epoch": 1.676553418419318, + "grad_norm": 0.24751955270767212, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.0121, + "step": 27980 + }, + { + "epoch": 1.6771526154952365, + "grad_norm": 0.5263744592666626, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0144, + "step": 27990 + }, + { + "epoch": 1.6777518125711546, + "grad_norm": 0.28027406334877014, + "learning_rate": 2.210624641425579e-06, + "loss": 0.0119, + "step": 28000 + }, + { + "epoch": 1.678351009647073, + "grad_norm": 0.37766972184181213, + "learning_rate": 2.208532855337684e-06, + "loss": 0.014, + "step": 28010 + }, + { + "epoch": 1.6789502067229911, + "grad_norm": 0.5175389051437378, + "learning_rate": 2.2064513865261646e-06, + "loss": 0.016, + "step": 28020 + }, + { + "epoch": 1.6795494037989096, + "grad_norm": 0.2620721459388733, + "learning_rate": 2.204380237433745e-06, + "loss": 0.0141, + "step": 28030 + }, + { + "epoch": 1.6801486008748276, + "grad_norm": 0.532120406627655, + "learning_rate": 2.202319410491029e-06, + "loss": 0.019, + "step": 28040 + }, + { + "epoch": 1.6807477979507461, + "grad_norm": 0.3872573971748352, + "learning_rate": 2.2002689081165155e-06, + "loss": 0.013, + "step": 28050 + }, + { + "epoch": 1.6813469950266642, + "grad_norm": 0.5482046008110046, + "learning_rate": 2.1982287327165827e-06, + "loss": 0.0121, + "step": 28060 + }, + { + "epoch": 1.6819461921025827, + "grad_norm": 0.2698966860771179, + "learning_rate": 2.19619888668549e-06, + "loss": 0.0154, + "step": 28070 + }, + { + "epoch": 1.6825453891785007, + "grad_norm": 0.5507254004478455, + "learning_rate": 2.1941793724053733e-06, + "loss": 0.0159, + "step": 28080 + }, + { + "epoch": 1.6831445862544192, + "grad_norm": 0.2223939299583435, + "learning_rate": 2.1921701922462463e-06, + "loss": 0.0165, + "step": 28090 + }, + { + "epoch": 1.6837437833303373, + "grad_norm": 0.2616906762123108, + "learning_rate": 2.190171348565994e-06, + "loss": 0.0162, + "step": 28100 + }, + { + "epoch": 1.6843429804062557, + "grad_norm": 0.23425602912902832, + "learning_rate": 2.188182843710369e-06, + "loss": 0.0142, + "step": 28110 + }, + { + "epoch": 1.6849421774821738, + "grad_norm": 0.6166255474090576, + "learning_rate": 2.1862046800129964e-06, + "loss": 0.0113, + "step": 28120 + }, + { + "epoch": 1.6855413745580923, + "grad_norm": 0.5097243189811707, + "learning_rate": 2.1842368597953578e-06, + "loss": 0.0117, + "step": 28130 + }, + { + "epoch": 1.6861405716340103, + "grad_norm": 0.371737003326416, + "learning_rate": 2.1822793853668e-06, + "loss": 0.0145, + "step": 28140 + }, + { + "epoch": 1.6867397687099288, + "grad_norm": 0.4312809407711029, + "learning_rate": 2.18033225902453e-06, + "loss": 0.0187, + "step": 28150 + }, + { + "epoch": 1.6873389657858469, + "grad_norm": 0.22457192838191986, + "learning_rate": 2.17839548305361e-06, + "loss": 0.0166, + "step": 28160 + }, + { + "epoch": 1.6879381628617653, + "grad_norm": 0.380092978477478, + "learning_rate": 2.1764690597269507e-06, + "loss": 0.0147, + "step": 28170 + }, + { + "epoch": 1.6885373599376834, + "grad_norm": 0.3026501536369324, + "learning_rate": 2.17455299130532e-06, + "loss": 0.0126, + "step": 28180 + }, + { + "epoch": 1.6891365570136019, + "grad_norm": 0.2680145800113678, + "learning_rate": 2.17264728003733e-06, + "loss": 0.0186, + "step": 28190 + }, + { + "epoch": 1.68973575408952, + "grad_norm": 0.36782440543174744, + "learning_rate": 2.17075192815944e-06, + "loss": 0.015, + "step": 28200 + }, + { + "epoch": 1.6903349511654384, + "grad_norm": 0.44267189502716064, + "learning_rate": 2.168866937895951e-06, + "loss": 0.0138, + "step": 28210 + }, + { + "epoch": 1.6909341482413565, + "grad_norm": 0.2974400818347931, + "learning_rate": 2.166992311459001e-06, + "loss": 0.0138, + "step": 28220 + }, + { + "epoch": 1.691533345317275, + "grad_norm": 0.30415791273117065, + "learning_rate": 2.1651280510485727e-06, + "loss": 0.0153, + "step": 28230 + }, + { + "epoch": 1.692132542393193, + "grad_norm": 0.3176470398902893, + "learning_rate": 2.163274158852476e-06, + "loss": 0.0117, + "step": 28240 + }, + { + "epoch": 1.6927317394691115, + "grad_norm": 0.4339515268802643, + "learning_rate": 2.1614306370463605e-06, + "loss": 0.0142, + "step": 28250 + }, + { + "epoch": 1.6933309365450295, + "grad_norm": 0.19751861691474915, + "learning_rate": 2.1595974877936977e-06, + "loss": 0.0141, + "step": 28260 + }, + { + "epoch": 1.693930133620948, + "grad_norm": 0.4287707507610321, + "learning_rate": 2.1577747132457933e-06, + "loss": 0.0125, + "step": 28270 + }, + { + "epoch": 1.694529330696866, + "grad_norm": 0.3044722378253937, + "learning_rate": 2.155962315541773e-06, + "loss": 0.0135, + "step": 28280 + }, + { + "epoch": 1.6951285277727846, + "grad_norm": 0.608513355255127, + "learning_rate": 2.154160296808588e-06, + "loss": 0.0142, + "step": 28290 + }, + { + "epoch": 1.6957277248487026, + "grad_norm": 0.48647579550743103, + "learning_rate": 2.1523686591610064e-06, + "loss": 0.0104, + "step": 28300 + }, + { + "epoch": 1.696326921924621, + "grad_norm": 0.2991415560245514, + "learning_rate": 2.1505874047016146e-06, + "loss": 0.0154, + "step": 28310 + }, + { + "epoch": 1.6969261190005391, + "grad_norm": 0.30744972825050354, + "learning_rate": 2.1488165355208147e-06, + "loss": 0.0132, + "step": 28320 + }, + { + "epoch": 1.6975253160764576, + "grad_norm": 0.31283605098724365, + "learning_rate": 2.14705605369682e-06, + "loss": 0.0134, + "step": 28330 + }, + { + "epoch": 1.6981245131523757, + "grad_norm": 0.31089895963668823, + "learning_rate": 2.145305961295655e-06, + "loss": 0.0114, + "step": 28340 + }, + { + "epoch": 1.6987237102282942, + "grad_norm": 0.33381298184394836, + "learning_rate": 2.143566260371149e-06, + "loss": 0.0119, + "step": 28350 + }, + { + "epoch": 1.6993229073042122, + "grad_norm": 0.3127349317073822, + "learning_rate": 2.141836952964938e-06, + "loss": 0.0123, + "step": 28360 + }, + { + "epoch": 1.6999221043801307, + "grad_norm": 0.2944924235343933, + "learning_rate": 2.1401180411064616e-06, + "loss": 0.0126, + "step": 28370 + }, + { + "epoch": 1.7005213014560487, + "grad_norm": 0.2658735513687134, + "learning_rate": 2.138409526812959e-06, + "loss": 0.0183, + "step": 28380 + }, + { + "epoch": 1.7011204985319672, + "grad_norm": 0.30414438247680664, + "learning_rate": 2.1367114120894663e-06, + "loss": 0.0155, + "step": 28390 + }, + { + "epoch": 1.7017196956078853, + "grad_norm": 0.2788392901420593, + "learning_rate": 2.1350236989288136e-06, + "loss": 0.0135, + "step": 28400 + }, + { + "epoch": 1.7023188926838038, + "grad_norm": 0.42318466305732727, + "learning_rate": 2.1333463893116294e-06, + "loss": 0.0145, + "step": 28410 + }, + { + "epoch": 1.7029180897597218, + "grad_norm": 0.3691503703594208, + "learning_rate": 2.131679485206329e-06, + "loss": 0.0153, + "step": 28420 + }, + { + "epoch": 1.7035172868356403, + "grad_norm": 0.39968568086624146, + "learning_rate": 2.130022988569117e-06, + "loss": 0.0112, + "step": 28430 + }, + { + "epoch": 1.7041164839115586, + "grad_norm": 0.5108732581138611, + "learning_rate": 2.128376901343984e-06, + "loss": 0.0121, + "step": 28440 + }, + { + "epoch": 1.7047156809874768, + "grad_norm": 0.1716325432062149, + "learning_rate": 2.1267412254627056e-06, + "loss": 0.0151, + "step": 28450 + }, + { + "epoch": 1.705314878063395, + "grad_norm": 0.21164365112781525, + "learning_rate": 2.1251159628448386e-06, + "loss": 0.0134, + "step": 28460 + }, + { + "epoch": 1.7059140751393134, + "grad_norm": 0.329767107963562, + "learning_rate": 2.1235011153977192e-06, + "loss": 0.0138, + "step": 28470 + }, + { + "epoch": 1.7065132722152316, + "grad_norm": 0.29405954480171204, + "learning_rate": 2.121896685016461e-06, + "loss": 0.0117, + "step": 28480 + }, + { + "epoch": 1.70711246929115, + "grad_norm": 0.3556554615497589, + "learning_rate": 2.1203026735839514e-06, + "loss": 0.0112, + "step": 28490 + }, + { + "epoch": 1.7077116663670682, + "grad_norm": 0.20903514325618744, + "learning_rate": 2.118719082970852e-06, + "loss": 0.015, + "step": 28500 + }, + { + "epoch": 1.7083108634429864, + "grad_norm": 0.3857610821723938, + "learning_rate": 2.1171459150355947e-06, + "loss": 0.0142, + "step": 28510 + }, + { + "epoch": 1.7089100605189047, + "grad_norm": 0.37805458903312683, + "learning_rate": 2.115583171624381e-06, + "loss": 0.0128, + "step": 28520 + }, + { + "epoch": 1.709509257594823, + "grad_norm": 0.31887349486351013, + "learning_rate": 2.114030854571176e-06, + "loss": 0.0147, + "step": 28530 + }, + { + "epoch": 1.7101084546707412, + "grad_norm": 0.21606838703155518, + "learning_rate": 2.1124889656977097e-06, + "loss": 0.0122, + "step": 28540 + }, + { + "epoch": 1.7107076517466595, + "grad_norm": 0.36150410771369934, + "learning_rate": 2.1109575068134756e-06, + "loss": 0.0155, + "step": 28550 + }, + { + "epoch": 1.7113068488225778, + "grad_norm": 0.41081342101097107, + "learning_rate": 2.1094364797157267e-06, + "loss": 0.0157, + "step": 28560 + }, + { + "epoch": 1.711906045898496, + "grad_norm": 0.30500170588493347, + "learning_rate": 2.107925886189472e-06, + "loss": 0.013, + "step": 28570 + }, + { + "epoch": 1.7125052429744143, + "grad_norm": 0.45380985736846924, + "learning_rate": 2.1064257280074763e-06, + "loss": 0.0141, + "step": 28580 + }, + { + "epoch": 1.7131044400503326, + "grad_norm": 0.3077009618282318, + "learning_rate": 2.1049360069302594e-06, + "loss": 0.0172, + "step": 28590 + }, + { + "epoch": 1.7137036371262508, + "grad_norm": 0.3113479018211365, + "learning_rate": 2.1034567247060926e-06, + "loss": 0.0151, + "step": 28600 + }, + { + "epoch": 1.7143028342021691, + "grad_norm": 0.4720151126384735, + "learning_rate": 2.1019878830709968e-06, + "loss": 0.0146, + "step": 28610 + }, + { + "epoch": 1.7149020312780874, + "grad_norm": 0.40217068791389465, + "learning_rate": 2.100529483748737e-06, + "loss": 0.0144, + "step": 28620 + }, + { + "epoch": 1.7155012283540056, + "grad_norm": 0.4546513557434082, + "learning_rate": 2.099081528450828e-06, + "loss": 0.0141, + "step": 28630 + }, + { + "epoch": 1.716100425429924, + "grad_norm": 0.4527282416820526, + "learning_rate": 2.097644018876524e-06, + "loss": 0.0112, + "step": 28640 + }, + { + "epoch": 1.7166996225058422, + "grad_norm": 0.34587305784225464, + "learning_rate": 2.096216956712826e-06, + "loss": 0.0108, + "step": 28650 + }, + { + "epoch": 1.7172988195817604, + "grad_norm": 0.37963685393333435, + "learning_rate": 2.0948003436344666e-06, + "loss": 0.0125, + "step": 28660 + }, + { + "epoch": 1.7178980166576787, + "grad_norm": 0.3598407208919525, + "learning_rate": 2.0933941813039244e-06, + "loss": 0.014, + "step": 28670 + }, + { + "epoch": 1.718497213733597, + "grad_norm": 0.40873903036117554, + "learning_rate": 2.091998471371406e-06, + "loss": 0.0123, + "step": 28680 + }, + { + "epoch": 1.7190964108095153, + "grad_norm": 0.27075979113578796, + "learning_rate": 2.0906132154748557e-06, + "loss": 0.0118, + "step": 28690 + }, + { + "epoch": 1.7196956078854335, + "grad_norm": 0.3349001109600067, + "learning_rate": 2.0892384152399504e-06, + "loss": 0.0163, + "step": 28700 + }, + { + "epoch": 1.7202948049613518, + "grad_norm": 0.2682032287120819, + "learning_rate": 2.0878740722800917e-06, + "loss": 0.0121, + "step": 28710 + }, + { + "epoch": 1.72089400203727, + "grad_norm": 0.45613598823547363, + "learning_rate": 2.086520188196413e-06, + "loss": 0.0139, + "step": 28720 + }, + { + "epoch": 1.7214931991131883, + "grad_norm": 0.4061899781227112, + "learning_rate": 2.085176764577774e-06, + "loss": 0.0167, + "step": 28730 + }, + { + "epoch": 1.7220923961891066, + "grad_norm": 0.24202635884284973, + "learning_rate": 2.083843803000755e-06, + "loss": 0.0158, + "step": 28740 + }, + { + "epoch": 1.7226915932650249, + "grad_norm": 0.44541120529174805, + "learning_rate": 2.0825213050296636e-06, + "loss": 0.0163, + "step": 28750 + }, + { + "epoch": 1.7232907903409431, + "grad_norm": 0.35003194212913513, + "learning_rate": 2.081209272216522e-06, + "loss": 0.0147, + "step": 28760 + }, + { + "epoch": 1.7238899874168614, + "grad_norm": 0.3613188564777374, + "learning_rate": 2.079907706101075e-06, + "loss": 0.0158, + "step": 28770 + }, + { + "epoch": 1.7244891844927797, + "grad_norm": 0.2081748992204666, + "learning_rate": 2.0786166082107833e-06, + "loss": 0.0119, + "step": 28780 + }, + { + "epoch": 1.725088381568698, + "grad_norm": 0.36700639128685, + "learning_rate": 2.0773359800608217e-06, + "loss": 0.0159, + "step": 28790 + }, + { + "epoch": 1.7256875786446162, + "grad_norm": 0.3384808599948883, + "learning_rate": 2.076065823154079e-06, + "loss": 0.0116, + "step": 28800 + }, + { + "epoch": 1.7262867757205345, + "grad_norm": 0.5698443055152893, + "learning_rate": 2.0748061389811543e-06, + "loss": 0.0135, + "step": 28810 + }, + { + "epoch": 1.7268859727964527, + "grad_norm": 0.22777511179447174, + "learning_rate": 2.073556929020357e-06, + "loss": 0.0136, + "step": 28820 + }, + { + "epoch": 1.727485169872371, + "grad_norm": 0.42319542169570923, + "learning_rate": 2.0723181947377057e-06, + "loss": 0.0138, + "step": 28830 + }, + { + "epoch": 1.7280843669482893, + "grad_norm": 0.48199185729026794, + "learning_rate": 2.0710899375869237e-06, + "loss": 0.0131, + "step": 28840 + }, + { + "epoch": 1.7286835640242075, + "grad_norm": 0.35982295870780945, + "learning_rate": 2.0698721590094387e-06, + "loss": 0.011, + "step": 28850 + }, + { + "epoch": 1.7292827611001258, + "grad_norm": 0.3580028712749481, + "learning_rate": 2.0686648604343824e-06, + "loss": 0.0177, + "step": 28860 + }, + { + "epoch": 1.729881958176044, + "grad_norm": 0.21845780313014984, + "learning_rate": 2.067468043278587e-06, + "loss": 0.0117, + "step": 28870 + }, + { + "epoch": 1.7304811552519623, + "grad_norm": 0.3009333908557892, + "learning_rate": 2.066281708946583e-06, + "loss": 0.0133, + "step": 28880 + }, + { + "epoch": 1.7310803523278806, + "grad_norm": 0.28064268827438354, + "learning_rate": 2.0651058588306007e-06, + "loss": 0.0118, + "step": 28890 + }, + { + "epoch": 1.7316795494037989, + "grad_norm": 0.2811881899833679, + "learning_rate": 2.063940494310565e-06, + "loss": 0.0127, + "step": 28900 + }, + { + "epoch": 1.7322787464797171, + "grad_norm": 0.25449663400650024, + "learning_rate": 2.062785616754097e-06, + "loss": 0.0152, + "step": 28910 + }, + { + "epoch": 1.7328779435556354, + "grad_norm": 0.41728776693344116, + "learning_rate": 2.0616412275165097e-06, + "loss": 0.0141, + "step": 28920 + }, + { + "epoch": 1.7334771406315537, + "grad_norm": 0.4925801753997803, + "learning_rate": 2.0605073279408063e-06, + "loss": 0.0114, + "step": 28930 + }, + { + "epoch": 1.734076337707472, + "grad_norm": 0.3441443145275116, + "learning_rate": 2.0593839193576833e-06, + "loss": 0.0175, + "step": 28940 + }, + { + "epoch": 1.7346755347833902, + "grad_norm": 0.598228931427002, + "learning_rate": 2.058271003085521e-06, + "loss": 0.0153, + "step": 28950 + }, + { + "epoch": 1.7352747318593085, + "grad_norm": 0.34356069564819336, + "learning_rate": 2.0571685804303905e-06, + "loss": 0.0126, + "step": 28960 + }, + { + "epoch": 1.7358739289352267, + "grad_norm": 0.2617851495742798, + "learning_rate": 2.0560766526860447e-06, + "loss": 0.0143, + "step": 28970 + }, + { + "epoch": 1.7364731260111452, + "grad_norm": 0.35475805401802063, + "learning_rate": 2.054995221133923e-06, + "loss": 0.0157, + "step": 28980 + }, + { + "epoch": 1.7370723230870633, + "grad_norm": 0.45460638403892517, + "learning_rate": 2.053924287043144e-06, + "loss": 0.0102, + "step": 28990 + }, + { + "epoch": 1.7376715201629818, + "grad_norm": 0.35972440242767334, + "learning_rate": 2.0528638516705106e-06, + "loss": 0.0137, + "step": 29000 + }, + { + "epoch": 1.7382707172388998, + "grad_norm": 0.3128221035003662, + "learning_rate": 2.051813916260501e-06, + "loss": 0.013, + "step": 29010 + }, + { + "epoch": 1.7388699143148183, + "grad_norm": 0.7588064670562744, + "learning_rate": 2.050774482045273e-06, + "loss": 0.0133, + "step": 29020 + }, + { + "epoch": 1.7394691113907363, + "grad_norm": 0.5074214935302734, + "learning_rate": 2.049745550244661e-06, + "loss": 0.016, + "step": 29030 + }, + { + "epoch": 1.7400683084666548, + "grad_norm": 0.48871514201164246, + "learning_rate": 2.0487271220661735e-06, + "loss": 0.0109, + "step": 29040 + }, + { + "epoch": 1.7406675055425729, + "grad_norm": 0.30255070328712463, + "learning_rate": 2.047719198704994e-06, + "loss": 0.0135, + "step": 29050 + }, + { + "epoch": 1.7412667026184914, + "grad_norm": 0.4563025236129761, + "learning_rate": 2.0467217813439762e-06, + "loss": 0.015, + "step": 29060 + }, + { + "epoch": 1.7418658996944094, + "grad_norm": 0.24640238285064697, + "learning_rate": 2.0457348711536426e-06, + "loss": 0.0137, + "step": 29070 + }, + { + "epoch": 1.742465096770328, + "grad_norm": 0.3724379241466522, + "learning_rate": 2.0447584692921894e-06, + "loss": 0.0141, + "step": 29080 + }, + { + "epoch": 1.743064293846246, + "grad_norm": 0.32838496565818787, + "learning_rate": 2.043792576905478e-06, + "loss": 0.0132, + "step": 29090 + }, + { + "epoch": 1.7436634909221644, + "grad_norm": 0.5715250968933105, + "learning_rate": 2.0428371951270394e-06, + "loss": 0.0125, + "step": 29100 + }, + { + "epoch": 1.7442626879980825, + "grad_norm": 0.29502353072166443, + "learning_rate": 2.0418923250780633e-06, + "loss": 0.0122, + "step": 29110 + }, + { + "epoch": 1.744861885074001, + "grad_norm": 0.2790152132511139, + "learning_rate": 2.0409579678674084e-06, + "loss": 0.0098, + "step": 29120 + }, + { + "epoch": 1.745461082149919, + "grad_norm": 0.9304683208465576, + "learning_rate": 2.040034124591597e-06, + "loss": 0.0146, + "step": 29130 + }, + { + "epoch": 1.7460602792258375, + "grad_norm": 0.26618465781211853, + "learning_rate": 2.039120796334809e-06, + "loss": 0.0128, + "step": 29140 + }, + { + "epoch": 1.7466594763017556, + "grad_norm": 0.28312423825263977, + "learning_rate": 2.0382179841688868e-06, + "loss": 0.0114, + "step": 29150 + }, + { + "epoch": 1.747258673377674, + "grad_norm": 0.30827805399894714, + "learning_rate": 2.0373256891533293e-06, + "loss": 0.0144, + "step": 29160 + }, + { + "epoch": 1.747857870453592, + "grad_norm": 0.29084426164627075, + "learning_rate": 2.0364439123352956e-06, + "loss": 0.0142, + "step": 29170 + }, + { + "epoch": 1.7484570675295106, + "grad_norm": 0.2825562655925751, + "learning_rate": 2.0355726547495998e-06, + "loss": 0.0129, + "step": 29180 + }, + { + "epoch": 1.7490562646054286, + "grad_norm": 0.5477129220962524, + "learning_rate": 2.034711917418711e-06, + "loss": 0.0149, + "step": 29190 + }, + { + "epoch": 1.7496554616813471, + "grad_norm": 0.27458444237709045, + "learning_rate": 2.033861701352752e-06, + "loss": 0.0131, + "step": 29200 + }, + { + "epoch": 1.7502546587572652, + "grad_norm": 0.5763506293296814, + "learning_rate": 2.0330220075494992e-06, + "loss": 0.012, + "step": 29210 + }, + { + "epoch": 1.7508538558331836, + "grad_norm": 0.29996973276138306, + "learning_rate": 2.0321928369943807e-06, + "loss": 0.0139, + "step": 29220 + }, + { + "epoch": 1.7514530529091017, + "grad_norm": 0.2447529435157776, + "learning_rate": 2.031374190660474e-06, + "loss": 0.0128, + "step": 29230 + }, + { + "epoch": 1.7520522499850202, + "grad_norm": 0.18921193480491638, + "learning_rate": 2.0305660695085054e-06, + "loss": 0.0132, + "step": 29240 + }, + { + "epoch": 1.7526514470609382, + "grad_norm": 0.35065901279449463, + "learning_rate": 2.0297684744868494e-06, + "loss": 0.0178, + "step": 29250 + }, + { + "epoch": 1.7532506441368567, + "grad_norm": 0.22698186337947845, + "learning_rate": 2.0289814065315306e-06, + "loss": 0.0131, + "step": 29260 + }, + { + "epoch": 1.7538498412127748, + "grad_norm": 0.7310769557952881, + "learning_rate": 2.0282048665662153e-06, + "loss": 0.0146, + "step": 29270 + }, + { + "epoch": 1.7544490382886933, + "grad_norm": 0.5522712469100952, + "learning_rate": 2.0274388555022176e-06, + "loss": 0.0135, + "step": 29280 + }, + { + "epoch": 1.7550482353646113, + "grad_norm": 0.29603326320648193, + "learning_rate": 2.0266833742384928e-06, + "loss": 0.016, + "step": 29290 + }, + { + "epoch": 1.7556474324405298, + "grad_norm": 0.3674398362636566, + "learning_rate": 2.0259384236616404e-06, + "loss": 0.0135, + "step": 29300 + }, + { + "epoch": 1.7562466295164478, + "grad_norm": 0.4478980302810669, + "learning_rate": 2.0252040046459022e-06, + "loss": 0.0178, + "step": 29310 + }, + { + "epoch": 1.7568458265923663, + "grad_norm": 0.32618647813796997, + "learning_rate": 2.02448011805316e-06, + "loss": 0.014, + "step": 29320 + }, + { + "epoch": 1.7574450236682844, + "grad_norm": 0.5377118587493896, + "learning_rate": 2.023766764732934e-06, + "loss": 0.0159, + "step": 29330 + }, + { + "epoch": 1.7580442207442029, + "grad_norm": 0.3777340352535248, + "learning_rate": 2.0230639455223853e-06, + "loss": 0.0143, + "step": 29340 + }, + { + "epoch": 1.758643417820121, + "grad_norm": 0.33518269658088684, + "learning_rate": 2.0223716612463095e-06, + "loss": 0.0149, + "step": 29350 + }, + { + "epoch": 1.7592426148960394, + "grad_norm": 0.3693374991416931, + "learning_rate": 2.0216899127171424e-06, + "loss": 0.0128, + "step": 29360 + }, + { + "epoch": 1.7598418119719574, + "grad_norm": 0.42809057235717773, + "learning_rate": 2.0210187007349534e-06, + "loss": 0.0168, + "step": 29370 + }, + { + "epoch": 1.760441009047876, + "grad_norm": 0.4278734028339386, + "learning_rate": 2.0203580260874474e-06, + "loss": 0.0125, + "step": 29380 + }, + { + "epoch": 1.761040206123794, + "grad_norm": 0.45604345202445984, + "learning_rate": 2.019707889549963e-06, + "loss": 0.0147, + "step": 29390 + }, + { + "epoch": 1.7616394031997125, + "grad_norm": 0.3464241921901703, + "learning_rate": 2.01906829188547e-06, + "loss": 0.015, + "step": 29400 + }, + { + "epoch": 1.7622386002756305, + "grad_norm": 0.28437861800193787, + "learning_rate": 2.018439233844574e-06, + "loss": 0.0138, + "step": 29410 + }, + { + "epoch": 1.762837797351549, + "grad_norm": 0.8128647208213806, + "learning_rate": 2.0178207161655087e-06, + "loss": 0.0146, + "step": 29420 + }, + { + "epoch": 1.763436994427467, + "grad_norm": 0.4243966341018677, + "learning_rate": 2.0172127395741398e-06, + "loss": 0.0138, + "step": 29430 + }, + { + "epoch": 1.7640361915033855, + "grad_norm": 0.23284584283828735, + "learning_rate": 2.0166153047839603e-06, + "loss": 0.0123, + "step": 29440 + }, + { + "epoch": 1.7646353885793036, + "grad_norm": 0.6289668083190918, + "learning_rate": 2.016028412496094e-06, + "loss": 0.0131, + "step": 29450 + }, + { + "epoch": 1.765234585655222, + "grad_norm": 0.26893526315689087, + "learning_rate": 2.015452063399292e-06, + "loss": 0.0144, + "step": 29460 + }, + { + "epoch": 1.7658337827311401, + "grad_norm": 0.31439170241355896, + "learning_rate": 2.014886258169932e-06, + "loss": 0.0105, + "step": 29470 + }, + { + "epoch": 1.7664329798070586, + "grad_norm": 0.3153708577156067, + "learning_rate": 2.014330997472017e-06, + "loss": 0.0118, + "step": 29480 + }, + { + "epoch": 1.7670321768829766, + "grad_norm": 0.25374165177345276, + "learning_rate": 2.013786281957177e-06, + "loss": 0.0143, + "step": 29490 + }, + { + "epoch": 1.7676313739588951, + "grad_norm": 0.43711739778518677, + "learning_rate": 2.0132521122646662e-06, + "loss": 0.0151, + "step": 29500 + }, + { + "epoch": 1.7682305710348134, + "grad_norm": 0.2920657992362976, + "learning_rate": 2.0127284890213623e-06, + "loss": 0.0142, + "step": 29510 + }, + { + "epoch": 1.7688297681107317, + "grad_norm": 0.45769479870796204, + "learning_rate": 2.012215412841767e-06, + "loss": 0.0118, + "step": 29520 + }, + { + "epoch": 1.76942896518665, + "grad_norm": 0.31419840455055237, + "learning_rate": 2.011712884328003e-06, + "loss": 0.0115, + "step": 29530 + }, + { + "epoch": 1.7700281622625682, + "grad_norm": 0.29443657398223877, + "learning_rate": 2.011220904069815e-06, + "loss": 0.0144, + "step": 29540 + }, + { + "epoch": 1.7706273593384865, + "grad_norm": 0.3117132782936096, + "learning_rate": 2.01073947264457e-06, + "loss": 0.0132, + "step": 29550 + }, + { + "epoch": 1.7712265564144047, + "grad_norm": 0.351385235786438, + "learning_rate": 2.0102685906172543e-06, + "loss": 0.0111, + "step": 29560 + }, + { + "epoch": 1.771825753490323, + "grad_norm": 0.27133694291114807, + "learning_rate": 2.009808258540475e-06, + "loss": 0.0154, + "step": 29570 + }, + { + "epoch": 1.7724249505662413, + "grad_norm": 0.30877798795700073, + "learning_rate": 2.009358476954456e-06, + "loss": 0.0093, + "step": 29580 + }, + { + "epoch": 1.7730241476421595, + "grad_norm": 0.2506785988807678, + "learning_rate": 2.008919246387043e-06, + "loss": 0.012, + "step": 29590 + }, + { + "epoch": 1.7736233447180778, + "grad_norm": 0.32467120885849, + "learning_rate": 2.0084905673536952e-06, + "loss": 0.0131, + "step": 29600 + }, + { + "epoch": 1.774222541793996, + "grad_norm": 0.22748734056949615, + "learning_rate": 2.0080724403574922e-06, + "loss": 0.0097, + "step": 29610 + }, + { + "epoch": 1.7748217388699143, + "grad_norm": 0.38346391916275024, + "learning_rate": 2.007664865889131e-06, + "loss": 0.014, + "step": 29620 + }, + { + "epoch": 1.7754209359458326, + "grad_norm": 0.296090304851532, + "learning_rate": 2.0072678444269208e-06, + "loss": 0.0145, + "step": 29630 + }, + { + "epoch": 1.7760201330217509, + "grad_norm": 0.2874438464641571, + "learning_rate": 2.006881376436789e-06, + "loss": 0.0133, + "step": 29640 + }, + { + "epoch": 1.7766193300976691, + "grad_norm": 0.2805752158164978, + "learning_rate": 2.0065054623722772e-06, + "loss": 0.0169, + "step": 29650 + }, + { + "epoch": 1.7772185271735874, + "grad_norm": 0.17779164016246796, + "learning_rate": 2.0061401026745425e-06, + "loss": 0.0118, + "step": 29660 + }, + { + "epoch": 1.7778177242495057, + "grad_norm": 0.316571444272995, + "learning_rate": 2.005785297772354e-06, + "loss": 0.0136, + "step": 29670 + }, + { + "epoch": 1.778416921325424, + "grad_norm": 0.8303540945053101, + "learning_rate": 2.005441048082095e-06, + "loss": 0.0172, + "step": 29680 + }, + { + "epoch": 1.7790161184013422, + "grad_norm": 0.3058635890483856, + "learning_rate": 2.0051073540077617e-06, + "loss": 0.0129, + "step": 29690 + }, + { + "epoch": 1.7796153154772605, + "grad_norm": 0.17514090240001678, + "learning_rate": 2.0047842159409633e-06, + "loss": 0.0099, + "step": 29700 + }, + { + "epoch": 1.7802145125531788, + "grad_norm": 0.22482258081436157, + "learning_rate": 2.004471634260919e-06, + "loss": 0.0131, + "step": 29710 + }, + { + "epoch": 1.780813709629097, + "grad_norm": 0.4026334881782532, + "learning_rate": 2.004169609334462e-06, + "loss": 0.0134, + "step": 29720 + }, + { + "epoch": 1.7814129067050153, + "grad_norm": 0.45236676931381226, + "learning_rate": 2.003878141516035e-06, + "loss": 0.0188, + "step": 29730 + }, + { + "epoch": 1.7820121037809336, + "grad_norm": 0.3150536119937897, + "learning_rate": 2.0035972311476916e-06, + "loss": 0.0126, + "step": 29740 + }, + { + "epoch": 1.7826113008568518, + "grad_norm": 0.9602782726287842, + "learning_rate": 2.0033268785590954e-06, + "loss": 0.0139, + "step": 29750 + }, + { + "epoch": 1.78321049793277, + "grad_norm": 0.2820151746273041, + "learning_rate": 2.003067084067522e-06, + "loss": 0.014, + "step": 29760 + }, + { + "epoch": 1.7838096950086884, + "grad_norm": 1.0188407897949219, + "learning_rate": 2.0028178479778523e-06, + "loss": 0.0137, + "step": 29770 + }, + { + "epoch": 1.7844088920846066, + "grad_norm": 0.26598837971687317, + "learning_rate": 2.0025791705825805e-06, + "loss": 0.0115, + "step": 29780 + }, + { + "epoch": 1.785008089160525, + "grad_norm": 0.2299095243215561, + "learning_rate": 2.0023510521618066e-06, + "loss": 0.0126, + "step": 29790 + }, + { + "epoch": 1.7856072862364432, + "grad_norm": 0.29679203033447266, + "learning_rate": 2.0021334929832407e-06, + "loss": 0.012, + "step": 29800 + }, + { + "epoch": 1.7862064833123614, + "grad_norm": 0.3352377116680145, + "learning_rate": 2.0019264933022016e-06, + "loss": 0.014, + "step": 29810 + }, + { + "epoch": 1.7868056803882797, + "grad_norm": 0.18228839337825775, + "learning_rate": 2.001730053361614e-06, + "loss": 0.0123, + "step": 29820 + }, + { + "epoch": 1.787404877464198, + "grad_norm": 0.5216359496116638, + "learning_rate": 2.0015441733920105e-06, + "loss": 0.0132, + "step": 29830 + }, + { + "epoch": 1.7880040745401162, + "grad_norm": 0.3130887746810913, + "learning_rate": 2.0013688536115332e-06, + "loss": 0.0168, + "step": 29840 + }, + { + "epoch": 1.7886032716160345, + "grad_norm": 0.4271252751350403, + "learning_rate": 2.0012040942259285e-06, + "loss": 0.0141, + "step": 29850 + }, + { + "epoch": 1.7892024686919528, + "grad_norm": 0.32060664892196655, + "learning_rate": 2.0010498954285506e-06, + "loss": 0.0134, + "step": 29860 + }, + { + "epoch": 1.789801665767871, + "grad_norm": 0.4360806345939636, + "learning_rate": 2.00090625740036e-06, + "loss": 0.0125, + "step": 29870 + }, + { + "epoch": 1.7904008628437893, + "grad_norm": 0.35824981331825256, + "learning_rate": 2.0007731803099256e-06, + "loss": 0.0129, + "step": 29880 + }, + { + "epoch": 1.7910000599197076, + "grad_norm": 0.37794366478919983, + "learning_rate": 2.00065066431342e-06, + "loss": 0.0151, + "step": 29890 + }, + { + "epoch": 1.7915992569956258, + "grad_norm": 0.302745521068573, + "learning_rate": 2.0005387095546222e-06, + "loss": 0.0142, + "step": 29900 + }, + { + "epoch": 1.792198454071544, + "grad_norm": 0.19773688912391663, + "learning_rate": 2.000437316164917e-06, + "loss": 0.0103, + "step": 29910 + }, + { + "epoch": 1.7927976511474624, + "grad_norm": 0.2933025658130646, + "learning_rate": 2.000346484263297e-06, + "loss": 0.0135, + "step": 29920 + }, + { + "epoch": 1.7933968482233806, + "grad_norm": 0.2572041451931, + "learning_rate": 2.0002662139563564e-06, + "loss": 0.0165, + "step": 29930 + }, + { + "epoch": 1.793996045299299, + "grad_norm": 0.6411796808242798, + "learning_rate": 2.0001965053382976e-06, + "loss": 0.0127, + "step": 29940 + }, + { + "epoch": 1.7945952423752172, + "grad_norm": 0.3087517321109772, + "learning_rate": 2.000137358490928e-06, + "loss": 0.0133, + "step": 29950 + }, + { + "epoch": 1.7951944394511354, + "grad_norm": 0.3539549708366394, + "learning_rate": 2.0000887734836583e-06, + "loss": 0.0117, + "step": 29960 + }, + { + "epoch": 1.7957936365270537, + "grad_norm": 0.3078557848930359, + "learning_rate": 2.0000507503735076e-06, + "loss": 0.0118, + "step": 29970 + }, + { + "epoch": 1.796392833602972, + "grad_norm": 0.44483524560928345, + "learning_rate": 2.0000232892050976e-06, + "loss": 0.0122, + "step": 29980 + }, + { + "epoch": 1.7969920306788902, + "grad_norm": 0.3110407888889313, + "learning_rate": 2.000006390010655e-06, + "loss": 0.0126, + "step": 29990 + }, + { + "epoch": 1.7975912277548085, + "grad_norm": 0.2597223222255707, + "learning_rate": 2.0000000528100118e-06, + "loss": 0.0138, + "step": 30000 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.873893288742748e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5fecc60b61aa66699566b01045633ce2fd4a6a74 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/checkpoint-30000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad96fcc5212b0fb64af2ed9b5a1ad33dee0cea6a86c08271b39c38f4388a38a +size 6097 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1176494009828ca1a8d623c603070781658572df --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/config.json @@ -0,0 +1,275 @@ +{ + "action_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 256000 + }, + "action_dim": 32, + "architectures": [ + "Pi0ForCausalLM" + ], + "assistant_token_number": -1, + "attention_bias": false, + "attention_dropout": 0.0, + "chunk_size": 50, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "image_aspect_ratio": "pad", + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "llm_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_bias": false, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 2, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 1, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "head_dim": 256, + "hidden_act": "gelu_pytorch_tanh", + "hidden_activation": null, + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_token_len": 256, + "initializer_range": 0.02, + "intermediate_size": 16384, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 8192, + "min_length": 0, + "model_type": "gemma", + "no_repeat_ngram_size": 0, + "num_attention_heads": 8, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "float32", + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "vocab_size": 257152 + }, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "linear", + "model_type": "dexbotic_pi0", + "num_attention_heads": 8, + "num_hidden_layers": 18, + "num_key_value_heads": 1, + "processor_config": "google/siglip-so400m-patch14-224", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 10000.0, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float32", + "transformers_version": "4.53.0", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": false, + "use_cache": true, + "use_mm_proj": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_fast", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 224, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "siglip_vision_model", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false, + "vision_use_head": false + }, + "vm_loss_enable": false, + "vocab_size": 257152 +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/generation_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c996a9f209aac9f32f94e3e5218df77d82de5b49 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/generation_config.json @@ -0,0 +1,7 @@ +{ + "_from_model_config": true, + "bos_token_id": 2, + "eos_token_id": 1, + "pad_token_id": 0, + "transformers_version": "4.53.0" +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00001-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ad9c79103582f8a1ffab88ea40d333e1b939f193 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f3654e6151059af0e9d70dae611c3f12b77f2dbb2d781046c7a4e2bb0c3d40e +size 4921072616 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00002-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8053a2a7749259e01ed494e85872dda42991a232 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a993cdf429e151c934401b7b37c60b257aab6ccaad1894566e8de90f9e681d62 +size 4978830984 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00003-of-00003.safetensors b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ea15f9ac0b63661f6fdb5577eaa7e1c8183bf5c4 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8a9614714e21f66d58f97e71778ea4e124243339033c047480027acdd850f86 +size 4100977896 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model.safetensors.index.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..8a282edc2a4bae9e96e7a39ae6c495242c21c0f3 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/model.safetensors.index.json @@ -0,0 +1,785 @@ +{ + "metadata": { + "total_parameters": 3500192528, + "total_size": 14000770112 + }, + "weight_map": { + "model.action_expert.embed_tokens.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.gate_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.mlp.up_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.o_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.action_expert.norm.weight": "model-00003-of-00003.safetensors", + "model.action_in_proj.bias": "model-00003-of-00003.safetensors", + "model.action_in_proj.weight": "model-00003-of-00003.safetensors", + "model.action_out_proj.bias": "model-00003-of-00003.safetensors", + "model.action_out_proj.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_in.weight": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.bias": "model-00003-of-00003.safetensors", + "model.action_time_mlp_out.weight": "model-00003-of-00003.safetensors", + "model.llm.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors", + "model.llm.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors", + "model.llm.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.llm.norm.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00003-of-00003.safetensors", + "model.mm_vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00003-of-00003.safetensors", + "model.state_proj.bias": "model-00003-of-00003.safetensors", + "model.state_proj.weight": "model-00003-of-00003.safetensors" + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/norm_stats.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/norm_stats.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9801b876d4902a6f04c8f4fc65c072e6082867 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/norm_stats.json @@ -0,0 +1,282 @@ +{ + "default": { + "min": -1, + "max": 1 + }, + "action": { + "min": [ + -4.131592681121827, + -18.96289906921387, + -16.909606227111816, + -1.205507601451874, + -2.2364452423095704, + -1.8819086204528812, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 16.65274486618042, + 37.19429024200439, + 23.655689654541014, + 1.3209557065963748, + 2.6528479496955875, + 1.1486967510223387, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 2.868856906890869, + 6.296340465545654, + 1.3196077346801758, + 0.007151931058615446, + -0.012491658329963684, + -0.12626242637634277, + 0.12140887975692749, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 4.3321146965026855, + 12.4215087890625, + 7.703039169311523, + 0.391439288854599, + 0.8076039552688599, + 0.505150318145752, + 0.9926025867462158, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "state": { + "min": [ + -2.72874687538147, + -21.763728466033935, + -21.229162658691408, + -2.350775989151001, + -4.0587354017257695, + -3.285622364997864, + -1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "max": [ + 24.5495108631134, + 30.41332916412354, + 14.36571702880859, + 1.8286980584144592, + 2.2455153399467473, + 1.9114159921646117, + 0.9996, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "mean": [ + 6.469674587249756, + 1.137302041053772, + -3.50521183013916, + -0.009232619777321815, + -0.7088616490364075, + -0.43785586953163147, + 0.14176446199417114, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ], + "std": [ + 6.948984146118164, + 16.641460418701172, + 8.162801742553711, + 0.6890953779220581, + 1.1180040836334229, + 0.9564125537872314, + 0.9899004101753235, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + } +} \ No newline at end of file diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/pi0.yaml b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5354e9dc5f5e893fce703f391d3f02316f74c73 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/pi0.yaml @@ -0,0 +1,61 @@ +experiment: libero_pi0 + +model: + model_name_or_path: /work/hdd/bfbo/xzhang42/dexbotic/checkpoints/libero/libero_pi0 + chat_template: dexbotic + mm_projector_type: mlp2x_gelu + mm_vision_tower: openai/clip-vit-large-patch14-336 + +data: + num_images: 3 + data_keys: + - input_ids + - labels + - action + - image + - state + - image_masks + images_keys: + - images_1 + - images_2 + aug_policy: + - pi0 + - color + - identity + image_aspect_ratio: pad + image_pad_mode: mean + action_config: + trajectory_length: 50 + delta: true + padding_action: true + +trainer: + deepspeed: null + num_train_epochs: 1 + num_train_steps: 30000 + gradient_accumulation_steps: 1 + save_strategy: steps + save_total_limit: 10 + save_only_model: true + logging_steps: 10 + gradient_checkpointing: false + dataloader_num_workers: 16 + model_max_length: 48 + bf16: true + tf32: true + lr_scheduler_type: cosine_with_min_lr + lr_scheduler_kwargs: + min_lr_rate: 0.1 + +optimizer: + optim: adamw_torch + weight_decay: 1.0e-10 + warmup_ratio: 0.0 + warmup_steps: 1000 + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_epsilon: 1.0e-8 + +checkpointing: + base_dir: /work/nvme/bfbo/xzhang42/pi0_checkpoints + start_step: 0 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/special_tokens_map.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e65cf9102dd5e7815a2d697b923eee50560291 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/special_tokens_map.json @@ -0,0 +1,82 @@ +{ + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer.model b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer_config.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..94644f1d50d03cb640abf1be66d5a62b49924431 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/tokenizer_config.json @@ -0,0 +1,1950 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "256000": { + "content": " 0", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256001": { + "content": " 1", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256002": { + "content": " 2", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256003": { + "content": " 3", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256004": { + "content": " 4", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256005": { + "content": " 5", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256006": { + "content": " 6", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256007": { + "content": " 7", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256008": { + "content": " 8", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256009": { + "content": " 9", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256010": { + "content": " 10", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256011": { + "content": " 11", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256012": { + "content": " 12", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256013": { + "content": " 13", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256014": { + "content": " 14", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256015": { + "content": " 15", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256016": { + "content": " 16", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256017": { + "content": " 17", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256018": { + "content": " 18", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256019": { + "content": " 19", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256020": { + "content": " 20", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256021": { + "content": " 21", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256022": { + "content": " 22", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256023": { + "content": " 23", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256024": { + "content": " 24", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256025": { + "content": " 25", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256026": { + "content": " 26", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256027": { + "content": " 27", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256028": { + "content": " 28", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256029": { + "content": " 29", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256030": { + "content": " 30", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256031": { + "content": " 31", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256032": { + "content": " 32", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256033": { + "content": " 33", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256034": { + "content": " 34", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256035": { + "content": " 35", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256036": { + "content": " 36", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256037": { + "content": " 37", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256038": { + "content": " 38", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256039": { + "content": " 39", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256040": { + "content": " 40", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256041": { + "content": " 41", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256042": { + "content": " 42", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256043": { + "content": " 43", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256044": { + "content": " 44", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256045": { + "content": " 45", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256046": { + "content": " 46", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "256047": { + "content": " 47", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + " 0", + " 1", + " 2", + " 3", + " 4", + " 5", + " 6", + " 7", + " 8", + " 9", + " 10", + " 11", + " 12", + " 13", + " 14", + " 15", + " 16", + " 17", + " 18", + " 19", + " 20", + " 21", + " 22", + " 23", + " 24", + " 25", + " 26", + " 27", + " 28", + " 29", + " 30", + " 31", + " 32", + " 33", + " 34", + " 35", + " 36", + " 37", + " 38", + " 39", + " 40", + " 41", + " 42", + " 43", + " 44", + " 45", + " 46", + " 47" + ], + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "extra_special_tokens": {}, + "model_max_length": 48, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/trainer_state.json b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5cbb9709bb22fe4bb421a2e9324055b7dfacdc87 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/trainer_state.json @@ -0,0 +1,21043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.7975912277548085, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005991970759182695, + "grad_norm": 5.55898904800415, + "learning_rate": 1.8e-07, + "loss": 0.7669, + "step": 10 + }, + { + "epoch": 0.001198394151836539, + "grad_norm": 3.9875104427337646, + "learning_rate": 3.8e-07, + "loss": 0.7281, + "step": 20 + }, + { + "epoch": 0.0017975912277548086, + "grad_norm": 6.316451072692871, + "learning_rate": 5.800000000000001e-07, + "loss": 0.7134, + "step": 30 + }, + { + "epoch": 0.002396788303673078, + "grad_norm": 4.037688255310059, + "learning_rate": 7.8e-07, + "loss": 0.6077, + "step": 40 + }, + { + "epoch": 0.0029959853795913476, + "grad_norm": 5.4920220375061035, + "learning_rate": 9.800000000000001e-07, + "loss": 0.6779, + "step": 50 + }, + { + "epoch": 0.003595182455509617, + "grad_norm": 3.809985876083374, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.5578, + "step": 60 + }, + { + "epoch": 0.004194379531427887, + "grad_norm": 5.501481533050537, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.5453, + "step": 70 + }, + { + "epoch": 0.004793576607346156, + "grad_norm": 2.584683418273926, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4145, + "step": 80 + }, + { + "epoch": 0.005392773683264426, + "grad_norm": 2.854585886001587, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.3617, + "step": 90 + }, + { + "epoch": 0.005991970759182695, + "grad_norm": 3.2181553840637207, + "learning_rate": 1.98e-06, + "loss": 0.3402, + "step": 100 + }, + { + "epoch": 0.006591167835100964, + "grad_norm": 1.6713179349899292, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.2286, + "step": 110 + }, + { + "epoch": 0.007190364911019234, + "grad_norm": 2.60302996635437, + "learning_rate": 2.38e-06, + "loss": 0.2477, + "step": 120 + }, + { + "epoch": 0.0077895619869375035, + "grad_norm": 1.7488818168640137, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.1342, + "step": 130 + }, + { + "epoch": 0.008388759062855774, + "grad_norm": 1.826812982559204, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.1243, + "step": 140 + }, + { + "epoch": 0.008987956138774043, + "grad_norm": 1.1744091510772705, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.1012, + "step": 150 + }, + { + "epoch": 0.009587153214692312, + "grad_norm": 2.3573529720306396, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.1108, + "step": 160 + }, + { + "epoch": 0.010186350290610581, + "grad_norm": 2.1422371864318848, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.1081, + "step": 170 + }, + { + "epoch": 0.010785547366528852, + "grad_norm": 0.6756604313850403, + "learning_rate": 3.58e-06, + "loss": 0.0947, + "step": 180 + }, + { + "epoch": 0.011384744442447121, + "grad_norm": 1.8197052478790283, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.103, + "step": 190 + }, + { + "epoch": 0.01198394151836539, + "grad_norm": 2.135390281677246, + "learning_rate": 3.980000000000001e-06, + "loss": 0.0791, + "step": 200 + }, + { + "epoch": 0.01258313859428366, + "grad_norm": 1.185013771057129, + "learning_rate": 4.18e-06, + "loss": 0.0751, + "step": 210 + }, + { + "epoch": 0.013182335670201929, + "grad_norm": 1.478454828262329, + "learning_rate": 4.38e-06, + "loss": 0.0685, + "step": 220 + }, + { + "epoch": 0.0137815327461202, + "grad_norm": 1.1979939937591553, + "learning_rate": 4.58e-06, + "loss": 0.0642, + "step": 230 + }, + { + "epoch": 0.014380729822038469, + "grad_norm": 1.3315266370773315, + "learning_rate": 4.78e-06, + "loss": 0.0706, + "step": 240 + }, + { + "epoch": 0.014979926897956738, + "grad_norm": 1.219875454902649, + "learning_rate": 4.980000000000001e-06, + "loss": 0.0647, + "step": 250 + }, + { + "epoch": 0.015579123973875007, + "grad_norm": 1.9281997680664062, + "learning_rate": 5.18e-06, + "loss": 0.0781, + "step": 260 + }, + { + "epoch": 0.016178321049793276, + "grad_norm": 0.5599610209465027, + "learning_rate": 5.380000000000001e-06, + "loss": 0.0742, + "step": 270 + }, + { + "epoch": 0.016777518125711547, + "grad_norm": 0.9128719568252563, + "learning_rate": 5.580000000000001e-06, + "loss": 0.0638, + "step": 280 + }, + { + "epoch": 0.017376715201629814, + "grad_norm": 0.5633432269096375, + "learning_rate": 5.78e-06, + "loss": 0.0633, + "step": 290 + }, + { + "epoch": 0.017975912277548085, + "grad_norm": 0.7961149215698242, + "learning_rate": 5.98e-06, + "loss": 0.062, + "step": 300 + }, + { + "epoch": 0.018575109353466356, + "grad_norm": 1.9408375024795532, + "learning_rate": 6.18e-06, + "loss": 0.0708, + "step": 310 + }, + { + "epoch": 0.019174306429384624, + "grad_norm": 1.1925369501113892, + "learning_rate": 6.380000000000001e-06, + "loss": 0.0654, + "step": 320 + }, + { + "epoch": 0.019773503505302895, + "grad_norm": 1.0636825561523438, + "learning_rate": 6.5800000000000005e-06, + "loss": 0.0513, + "step": 330 + }, + { + "epoch": 0.020372700581221162, + "grad_norm": 0.5671424865722656, + "learning_rate": 6.780000000000001e-06, + "loss": 0.0561, + "step": 340 + }, + { + "epoch": 0.020971897657139433, + "grad_norm": 0.8431388139724731, + "learning_rate": 6.98e-06, + "loss": 0.0514, + "step": 350 + }, + { + "epoch": 0.021571094733057704, + "grad_norm": 1.3813819885253906, + "learning_rate": 7.180000000000001e-06, + "loss": 0.0619, + "step": 360 + }, + { + "epoch": 0.02217029180897597, + "grad_norm": 0.7528055906295776, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.0502, + "step": 370 + }, + { + "epoch": 0.022769488884894242, + "grad_norm": 1.38446044921875, + "learning_rate": 7.58e-06, + "loss": 0.0623, + "step": 380 + }, + { + "epoch": 0.02336868596081251, + "grad_norm": 0.9472984671592712, + "learning_rate": 7.78e-06, + "loss": 0.0471, + "step": 390 + }, + { + "epoch": 0.02396788303673078, + "grad_norm": 0.640555739402771, + "learning_rate": 7.980000000000002e-06, + "loss": 0.0539, + "step": 400 + }, + { + "epoch": 0.02456708011264905, + "grad_norm": 1.4841065406799316, + "learning_rate": 8.18e-06, + "loss": 0.0684, + "step": 410 + }, + { + "epoch": 0.02516627718856732, + "grad_norm": 1.0691452026367188, + "learning_rate": 8.380000000000001e-06, + "loss": 0.0576, + "step": 420 + }, + { + "epoch": 0.02576547426448559, + "grad_norm": 0.8026740550994873, + "learning_rate": 8.580000000000001e-06, + "loss": 0.0579, + "step": 430 + }, + { + "epoch": 0.026364671340403857, + "grad_norm": 1.3472259044647217, + "learning_rate": 8.78e-06, + "loss": 0.0725, + "step": 440 + }, + { + "epoch": 0.026963868416322128, + "grad_norm": 0.8364902138710022, + "learning_rate": 8.98e-06, + "loss": 0.0613, + "step": 450 + }, + { + "epoch": 0.0275630654922404, + "grad_norm": 1.0086181163787842, + "learning_rate": 9.180000000000002e-06, + "loss": 0.0558, + "step": 460 + }, + { + "epoch": 0.028162262568158666, + "grad_norm": 1.0559569597244263, + "learning_rate": 9.38e-06, + "loss": 0.0561, + "step": 470 + }, + { + "epoch": 0.028761459644076937, + "grad_norm": 0.9138600826263428, + "learning_rate": 9.58e-06, + "loss": 0.0507, + "step": 480 + }, + { + "epoch": 0.029360656719995208, + "grad_norm": 0.6099390387535095, + "learning_rate": 9.780000000000001e-06, + "loss": 0.0543, + "step": 490 + }, + { + "epoch": 0.029959853795913476, + "grad_norm": 0.890690803527832, + "learning_rate": 9.980000000000001e-06, + "loss": 0.071, + "step": 500 + }, + { + "epoch": 0.030559050871831746, + "grad_norm": 0.8349231481552124, + "learning_rate": 1.018e-05, + "loss": 0.0515, + "step": 510 + }, + { + "epoch": 0.031158247947750014, + "grad_norm": 1.5466762781143188, + "learning_rate": 1.038e-05, + "loss": 0.0865, + "step": 520 + }, + { + "epoch": 0.031757445023668285, + "grad_norm": 1.0859519243240356, + "learning_rate": 1.0580000000000002e-05, + "loss": 0.0511, + "step": 530 + }, + { + "epoch": 0.03235664209958655, + "grad_norm": 0.7235454320907593, + "learning_rate": 1.0780000000000002e-05, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.03295583917550483, + "grad_norm": 0.6314525008201599, + "learning_rate": 1.0980000000000002e-05, + "loss": 0.0494, + "step": 550 + }, + { + "epoch": 0.033555036251423094, + "grad_norm": 1.5067164897918701, + "learning_rate": 1.1180000000000001e-05, + "loss": 0.0453, + "step": 560 + }, + { + "epoch": 0.03415423332734136, + "grad_norm": 0.9329689145088196, + "learning_rate": 1.138e-05, + "loss": 0.0565, + "step": 570 + }, + { + "epoch": 0.03475343040325963, + "grad_norm": 1.3631505966186523, + "learning_rate": 1.1580000000000001e-05, + "loss": 0.0513, + "step": 580 + }, + { + "epoch": 0.0353526274791779, + "grad_norm": 1.2341063022613525, + "learning_rate": 1.178e-05, + "loss": 0.0468, + "step": 590 + }, + { + "epoch": 0.03595182455509617, + "grad_norm": 0.7126315832138062, + "learning_rate": 1.198e-05, + "loss": 0.0465, + "step": 600 + }, + { + "epoch": 0.03655102163101444, + "grad_norm": 0.9995419383049011, + "learning_rate": 1.218e-05, + "loss": 0.0423, + "step": 610 + }, + { + "epoch": 0.03715021870693271, + "grad_norm": 0.7614652514457703, + "learning_rate": 1.2380000000000002e-05, + "loss": 0.0466, + "step": 620 + }, + { + "epoch": 0.03774941578285098, + "grad_norm": 0.7718682289123535, + "learning_rate": 1.2580000000000002e-05, + "loss": 0.0508, + "step": 630 + }, + { + "epoch": 0.03834861285876925, + "grad_norm": 0.7280911803245544, + "learning_rate": 1.2780000000000001e-05, + "loss": 0.0481, + "step": 640 + }, + { + "epoch": 0.03894780993468752, + "grad_norm": 0.6350377798080444, + "learning_rate": 1.2980000000000001e-05, + "loss": 0.0493, + "step": 650 + }, + { + "epoch": 0.03954700701060579, + "grad_norm": 0.6868598461151123, + "learning_rate": 1.3180000000000001e-05, + "loss": 0.057, + "step": 660 + }, + { + "epoch": 0.04014620408652406, + "grad_norm": 1.132020354270935, + "learning_rate": 1.3380000000000002e-05, + "loss": 0.0464, + "step": 670 + }, + { + "epoch": 0.040745401162442324, + "grad_norm": 1.097875952720642, + "learning_rate": 1.3580000000000002e-05, + "loss": 0.0465, + "step": 680 + }, + { + "epoch": 0.0413445982383606, + "grad_norm": 0.8246905207633972, + "learning_rate": 1.378e-05, + "loss": 0.0488, + "step": 690 + }, + { + "epoch": 0.041943795314278866, + "grad_norm": 0.5858931541442871, + "learning_rate": 1.398e-05, + "loss": 0.0533, + "step": 700 + }, + { + "epoch": 0.04254299239019713, + "grad_norm": 0.7141314744949341, + "learning_rate": 1.418e-05, + "loss": 0.057, + "step": 710 + }, + { + "epoch": 0.04314218946611541, + "grad_norm": 0.87618488073349, + "learning_rate": 1.4380000000000001e-05, + "loss": 0.0417, + "step": 720 + }, + { + "epoch": 0.043741386542033675, + "grad_norm": 0.8312808871269226, + "learning_rate": 1.4580000000000001e-05, + "loss": 0.0627, + "step": 730 + }, + { + "epoch": 0.04434058361795194, + "grad_norm": 0.5213949680328369, + "learning_rate": 1.478e-05, + "loss": 0.0526, + "step": 740 + }, + { + "epoch": 0.04493978069387022, + "grad_norm": 0.7599508762359619, + "learning_rate": 1.498e-05, + "loss": 0.0487, + "step": 750 + }, + { + "epoch": 0.045538977769788484, + "grad_norm": 0.9282987713813782, + "learning_rate": 1.5180000000000002e-05, + "loss": 0.0544, + "step": 760 + }, + { + "epoch": 0.04613817484570675, + "grad_norm": 1.5959566831588745, + "learning_rate": 1.5380000000000002e-05, + "loss": 0.0594, + "step": 770 + }, + { + "epoch": 0.04673737192162502, + "grad_norm": 0.6384497284889221, + "learning_rate": 1.5580000000000003e-05, + "loss": 0.049, + "step": 780 + }, + { + "epoch": 0.047336568997543294, + "grad_norm": 0.5377854108810425, + "learning_rate": 1.578e-05, + "loss": 0.0529, + "step": 790 + }, + { + "epoch": 0.04793576607346156, + "grad_norm": 0.6186609864234924, + "learning_rate": 1.5980000000000003e-05, + "loss": 0.0485, + "step": 800 + }, + { + "epoch": 0.04853496314937983, + "grad_norm": 0.9750168323516846, + "learning_rate": 1.618e-05, + "loss": 0.0458, + "step": 810 + }, + { + "epoch": 0.0491341602252981, + "grad_norm": 0.6810588836669922, + "learning_rate": 1.638e-05, + "loss": 0.0521, + "step": 820 + }, + { + "epoch": 0.04973335730121637, + "grad_norm": 0.8613447546958923, + "learning_rate": 1.658e-05, + "loss": 0.0464, + "step": 830 + }, + { + "epoch": 0.05033255437713464, + "grad_norm": 0.8379164338111877, + "learning_rate": 1.6780000000000002e-05, + "loss": 0.0589, + "step": 840 + }, + { + "epoch": 0.05093175145305291, + "grad_norm": 0.9312345385551453, + "learning_rate": 1.698e-05, + "loss": 0.0534, + "step": 850 + }, + { + "epoch": 0.05153094852897118, + "grad_norm": 0.6983106732368469, + "learning_rate": 1.718e-05, + "loss": 0.0591, + "step": 860 + }, + { + "epoch": 0.05213014560488945, + "grad_norm": 0.6549938321113586, + "learning_rate": 1.7380000000000003e-05, + "loss": 0.0571, + "step": 870 + }, + { + "epoch": 0.052729342680807714, + "grad_norm": 0.3887499272823334, + "learning_rate": 1.758e-05, + "loss": 0.0362, + "step": 880 + }, + { + "epoch": 0.05332853975672599, + "grad_norm": 1.1392686367034912, + "learning_rate": 1.7780000000000003e-05, + "loss": 0.0602, + "step": 890 + }, + { + "epoch": 0.053927736832644256, + "grad_norm": 0.834979772567749, + "learning_rate": 1.798e-05, + "loss": 0.0483, + "step": 900 + }, + { + "epoch": 0.054526933908562523, + "grad_norm": 0.9094700813293457, + "learning_rate": 1.8180000000000002e-05, + "loss": 0.0536, + "step": 910 + }, + { + "epoch": 0.0551261309844808, + "grad_norm": 0.9519254565238953, + "learning_rate": 1.8380000000000004e-05, + "loss": 0.0514, + "step": 920 + }, + { + "epoch": 0.055725328060399065, + "grad_norm": 0.6514044404029846, + "learning_rate": 1.858e-05, + "loss": 0.0466, + "step": 930 + }, + { + "epoch": 0.05632452513631733, + "grad_norm": 0.6005147099494934, + "learning_rate": 1.878e-05, + "loss": 0.0527, + "step": 940 + }, + { + "epoch": 0.05692372221223561, + "grad_norm": 1.0990339517593384, + "learning_rate": 1.898e-05, + "loss": 0.0453, + "step": 950 + }, + { + "epoch": 0.057522919288153875, + "grad_norm": 0.7029110193252563, + "learning_rate": 1.918e-05, + "loss": 0.0527, + "step": 960 + }, + { + "epoch": 0.05812211636407214, + "grad_norm": 0.6106461882591248, + "learning_rate": 1.938e-05, + "loss": 0.043, + "step": 970 + }, + { + "epoch": 0.058721313439990416, + "grad_norm": 0.48976996541023254, + "learning_rate": 1.9580000000000002e-05, + "loss": 0.0482, + "step": 980 + }, + { + "epoch": 0.059320510515908684, + "grad_norm": 1.045139193534851, + "learning_rate": 1.978e-05, + "loss": 0.0449, + "step": 990 + }, + { + "epoch": 0.05991970759182695, + "grad_norm": 0.7444337010383606, + "learning_rate": 1.9980000000000002e-05, + "loss": 0.0499, + "step": 1000 + }, + { + "epoch": 0.06051890466774522, + "grad_norm": 0.8378720879554749, + "learning_rate": 1.9999995722389397e-05, + "loss": 0.0606, + "step": 1010 + }, + { + "epoch": 0.06111810174366349, + "grad_norm": 0.5345956683158875, + "learning_rate": 1.9999980935592526e-05, + "loss": 0.041, + "step": 1020 + }, + { + "epoch": 0.06171729881958176, + "grad_norm": 0.6428268551826477, + "learning_rate": 1.9999955586816727e-05, + "loss": 0.0648, + "step": 1030 + }, + { + "epoch": 0.06231649589550003, + "grad_norm": 0.9010246992111206, + "learning_rate": 1.9999919676091748e-05, + "loss": 0.0441, + "step": 1040 + }, + { + "epoch": 0.0629156929714183, + "grad_norm": 0.6655222177505493, + "learning_rate": 1.9999873203459737e-05, + "loss": 0.0532, + "step": 1050 + }, + { + "epoch": 0.06351489004733657, + "grad_norm": 0.5328973531723022, + "learning_rate": 1.999981616897523e-05, + "loss": 0.0488, + "step": 1060 + }, + { + "epoch": 0.06411408712325484, + "grad_norm": 1.2394806146621704, + "learning_rate": 1.9999748572705168e-05, + "loss": 0.0525, + "step": 1070 + }, + { + "epoch": 0.0647132841991731, + "grad_norm": 0.9671902656555176, + "learning_rate": 1.999967041472886e-05, + "loss": 0.051, + "step": 1080 + }, + { + "epoch": 0.06531248127509137, + "grad_norm": 0.8754792213439941, + "learning_rate": 1.9999581695138044e-05, + "loss": 0.054, + "step": 1090 + }, + { + "epoch": 0.06591167835100965, + "grad_norm": 0.524354875087738, + "learning_rate": 1.9999482414036835e-05, + "loss": 0.0682, + "step": 1100 + }, + { + "epoch": 0.06651087542692792, + "grad_norm": 1.0633796453475952, + "learning_rate": 1.9999372571541743e-05, + "loss": 0.0435, + "step": 1110 + }, + { + "epoch": 0.06711007250284619, + "grad_norm": 0.7348024249076843, + "learning_rate": 1.999925216778167e-05, + "loss": 0.0436, + "step": 1120 + }, + { + "epoch": 0.06770926957876446, + "grad_norm": 0.923546552658081, + "learning_rate": 1.9999121202897924e-05, + "loss": 0.0501, + "step": 1130 + }, + { + "epoch": 0.06830846665468272, + "grad_norm": 1.0579051971435547, + "learning_rate": 1.9998979677044197e-05, + "loss": 0.0562, + "step": 1140 + }, + { + "epoch": 0.06890766373060099, + "grad_norm": 0.8214036822319031, + "learning_rate": 1.999882759038658e-05, + "loss": 0.057, + "step": 1150 + }, + { + "epoch": 0.06950686080651926, + "grad_norm": 0.7640904188156128, + "learning_rate": 1.9998664943103546e-05, + "loss": 0.0468, + "step": 1160 + }, + { + "epoch": 0.07010605788243754, + "grad_norm": 0.5744732022285461, + "learning_rate": 1.999849173538598e-05, + "loss": 0.0416, + "step": 1170 + }, + { + "epoch": 0.0707052549583558, + "grad_norm": 0.40397152304649353, + "learning_rate": 1.9998307967437146e-05, + "loss": 0.0389, + "step": 1180 + }, + { + "epoch": 0.07130445203427407, + "grad_norm": 0.6207796931266785, + "learning_rate": 1.999811363947271e-05, + "loss": 0.0484, + "step": 1190 + }, + { + "epoch": 0.07190364911019234, + "grad_norm": 1.5230320692062378, + "learning_rate": 1.9997908751720726e-05, + "loss": 0.0586, + "step": 1200 + }, + { + "epoch": 0.07250284618611061, + "grad_norm": 0.8499330282211304, + "learning_rate": 1.9997693304421636e-05, + "loss": 0.0671, + "step": 1210 + }, + { + "epoch": 0.07310204326202888, + "grad_norm": 0.7697583436965942, + "learning_rate": 1.9997467297828283e-05, + "loss": 0.061, + "step": 1220 + }, + { + "epoch": 0.07370124033794716, + "grad_norm": 0.6107252836227417, + "learning_rate": 1.9997230732205896e-05, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.07430043741386542, + "grad_norm": 0.40468829870224, + "learning_rate": 1.9996983607832105e-05, + "loss": 0.0558, + "step": 1240 + }, + { + "epoch": 0.07489963448978369, + "grad_norm": 0.7711566686630249, + "learning_rate": 1.999672592499692e-05, + "loss": 0.0487, + "step": 1250 + }, + { + "epoch": 0.07549883156570196, + "grad_norm": 1.0216137170791626, + "learning_rate": 1.999645768400274e-05, + "loss": 0.0411, + "step": 1260 + }, + { + "epoch": 0.07609802864162023, + "grad_norm": 1.1135109663009644, + "learning_rate": 1.9996178885164368e-05, + "loss": 0.0428, + "step": 1270 + }, + { + "epoch": 0.0766972257175385, + "grad_norm": 0.545289158821106, + "learning_rate": 1.999588952880899e-05, + "loss": 0.0426, + "step": 1280 + }, + { + "epoch": 0.07729642279345676, + "grad_norm": 0.9514102339744568, + "learning_rate": 1.999558961527618e-05, + "loss": 0.0529, + "step": 1290 + }, + { + "epoch": 0.07789561986937504, + "grad_norm": 0.9448748826980591, + "learning_rate": 1.9995279144917905e-05, + "loss": 0.0468, + "step": 1300 + }, + { + "epoch": 0.07849481694529331, + "grad_norm": 1.1176340579986572, + "learning_rate": 1.9994958118098517e-05, + "loss": 0.06, + "step": 1310 + }, + { + "epoch": 0.07909401402121158, + "grad_norm": 0.6428054571151733, + "learning_rate": 1.9994626535194757e-05, + "loss": 0.0398, + "step": 1320 + }, + { + "epoch": 0.07969321109712985, + "grad_norm": 0.8000763058662415, + "learning_rate": 1.999428439659576e-05, + "loss": 0.0688, + "step": 1330 + }, + { + "epoch": 0.08029240817304811, + "grad_norm": 0.7624617218971252, + "learning_rate": 1.9993931702703046e-05, + "loss": 0.0524, + "step": 1340 + }, + { + "epoch": 0.08089160524896638, + "grad_norm": 0.7986068725585938, + "learning_rate": 1.9993568453930513e-05, + "loss": 0.0511, + "step": 1350 + }, + { + "epoch": 0.08149080232488465, + "grad_norm": 1.179044246673584, + "learning_rate": 1.999319465070446e-05, + "loss": 0.0518, + "step": 1360 + }, + { + "epoch": 0.08208999940080293, + "grad_norm": 0.7511209845542908, + "learning_rate": 1.9992810293463564e-05, + "loss": 0.041, + "step": 1370 + }, + { + "epoch": 0.0826891964767212, + "grad_norm": 0.8336644768714905, + "learning_rate": 1.9992415382658894e-05, + "loss": 0.0483, + "step": 1380 + }, + { + "epoch": 0.08328839355263946, + "grad_norm": 0.7198546528816223, + "learning_rate": 1.9992009918753896e-05, + "loss": 0.0472, + "step": 1390 + }, + { + "epoch": 0.08388759062855773, + "grad_norm": 1.404756784439087, + "learning_rate": 1.99915939022244e-05, + "loss": 0.0479, + "step": 1400 + }, + { + "epoch": 0.084486787704476, + "grad_norm": 0.861412525177002, + "learning_rate": 1.9991167333558633e-05, + "loss": 0.0448, + "step": 1410 + }, + { + "epoch": 0.08508598478039427, + "grad_norm": 1.2575286626815796, + "learning_rate": 1.9990730213257187e-05, + "loss": 0.0504, + "step": 1420 + }, + { + "epoch": 0.08568518185631255, + "grad_norm": 0.7020149230957031, + "learning_rate": 1.9990282541833063e-05, + "loss": 0.0416, + "step": 1430 + }, + { + "epoch": 0.08628437893223082, + "grad_norm": 0.9072129726409912, + "learning_rate": 1.998982431981161e-05, + "loss": 0.0483, + "step": 1440 + }, + { + "epoch": 0.08688357600814908, + "grad_norm": 0.5503928661346436, + "learning_rate": 1.998935554773059e-05, + "loss": 0.0498, + "step": 1450 + }, + { + "epoch": 0.08748277308406735, + "grad_norm": 0.5776561498641968, + "learning_rate": 1.9988876226140126e-05, + "loss": 0.0399, + "step": 1460 + }, + { + "epoch": 0.08808197015998562, + "grad_norm": 0.7854406237602234, + "learning_rate": 1.998838635560274e-05, + "loss": 0.0431, + "step": 1470 + }, + { + "epoch": 0.08868116723590388, + "grad_norm": 0.7011817097663879, + "learning_rate": 1.9987885936693304e-05, + "loss": 0.0615, + "step": 1480 + }, + { + "epoch": 0.08928036431182215, + "grad_norm": 0.7760916352272034, + "learning_rate": 1.9987374969999102e-05, + "loss": 0.0525, + "step": 1490 + }, + { + "epoch": 0.08987956138774043, + "grad_norm": 0.9866206049919128, + "learning_rate": 1.9986853456119776e-05, + "loss": 0.0492, + "step": 1500 + }, + { + "epoch": 0.0904787584636587, + "grad_norm": 0.7466640472412109, + "learning_rate": 1.998632139566735e-05, + "loss": 0.0564, + "step": 1510 + }, + { + "epoch": 0.09107795553957697, + "grad_norm": 0.8808642029762268, + "learning_rate": 1.9985778789266233e-05, + "loss": 0.0461, + "step": 1520 + }, + { + "epoch": 0.09167715261549524, + "grad_norm": 0.8980852365493774, + "learning_rate": 1.99852256375532e-05, + "loss": 0.0613, + "step": 1530 + }, + { + "epoch": 0.0922763496914135, + "grad_norm": 0.6824257969856262, + "learning_rate": 1.9984661941177402e-05, + "loss": 0.0763, + "step": 1540 + }, + { + "epoch": 0.09287554676733177, + "grad_norm": 0.681532084941864, + "learning_rate": 1.9984087700800375e-05, + "loss": 0.0492, + "step": 1550 + }, + { + "epoch": 0.09347474384325004, + "grad_norm": 0.5667393207550049, + "learning_rate": 1.9983502917096018e-05, + "loss": 0.0471, + "step": 1560 + }, + { + "epoch": 0.09407394091916832, + "grad_norm": 0.5026432275772095, + "learning_rate": 1.9982907590750607e-05, + "loss": 0.0424, + "step": 1570 + }, + { + "epoch": 0.09467313799508659, + "grad_norm": 0.37448638677597046, + "learning_rate": 1.9982301722462793e-05, + "loss": 0.037, + "step": 1580 + }, + { + "epoch": 0.09527233507100485, + "grad_norm": 0.6236661076545715, + "learning_rate": 1.9981685312943594e-05, + "loss": 0.0557, + "step": 1590 + }, + { + "epoch": 0.09587153214692312, + "grad_norm": 0.9748323559761047, + "learning_rate": 1.9981058362916402e-05, + "loss": 0.0326, + "step": 1600 + }, + { + "epoch": 0.09647072922284139, + "grad_norm": 0.7733910083770752, + "learning_rate": 1.9980420873116975e-05, + "loss": 0.0527, + "step": 1610 + }, + { + "epoch": 0.09706992629875966, + "grad_norm": 0.6466084718704224, + "learning_rate": 1.9979772844293444e-05, + "loss": 0.0537, + "step": 1620 + }, + { + "epoch": 0.09766912337467794, + "grad_norm": 0.6644402146339417, + "learning_rate": 1.9979114277206313e-05, + "loss": 0.0434, + "step": 1630 + }, + { + "epoch": 0.0982683204505962, + "grad_norm": 1.5936143398284912, + "learning_rate": 1.997844517262844e-05, + "loss": 0.0495, + "step": 1640 + }, + { + "epoch": 0.09886751752651447, + "grad_norm": 0.5655786991119385, + "learning_rate": 1.9977765531345057e-05, + "loss": 0.0475, + "step": 1650 + }, + { + "epoch": 0.09946671460243274, + "grad_norm": 0.9557194709777832, + "learning_rate": 1.9977075354153766e-05, + "loss": 0.0518, + "step": 1660 + }, + { + "epoch": 0.10006591167835101, + "grad_norm": 0.8929481506347656, + "learning_rate": 1.9976374641864525e-05, + "loss": 0.0435, + "step": 1670 + }, + { + "epoch": 0.10066510875426928, + "grad_norm": 0.7515624761581421, + "learning_rate": 1.9975663395299656e-05, + "loss": 0.0404, + "step": 1680 + }, + { + "epoch": 0.10126430583018754, + "grad_norm": 0.7718303203582764, + "learning_rate": 1.997494161529385e-05, + "loss": 0.0476, + "step": 1690 + }, + { + "epoch": 0.10186350290610582, + "grad_norm": 0.5583183765411377, + "learning_rate": 1.9974209302694156e-05, + "loss": 0.0495, + "step": 1700 + }, + { + "epoch": 0.10246269998202409, + "grad_norm": 0.7166038155555725, + "learning_rate": 1.9973466458359982e-05, + "loss": 0.0601, + "step": 1710 + }, + { + "epoch": 0.10306189705794236, + "grad_norm": 0.9311782717704773, + "learning_rate": 1.99727130831631e-05, + "loss": 0.0507, + "step": 1720 + }, + { + "epoch": 0.10366109413386063, + "grad_norm": 0.6159361600875854, + "learning_rate": 1.9971949177987635e-05, + "loss": 0.0319, + "step": 1730 + }, + { + "epoch": 0.1042602912097789, + "grad_norm": 0.816769003868103, + "learning_rate": 1.9971174743730074e-05, + "loss": 0.0505, + "step": 1740 + }, + { + "epoch": 0.10485948828569716, + "grad_norm": 0.9040331244468689, + "learning_rate": 1.9970389781299258e-05, + "loss": 0.0498, + "step": 1750 + }, + { + "epoch": 0.10545868536161543, + "grad_norm": 1.696012020111084, + "learning_rate": 1.9969594291616384e-05, + "loss": 0.0689, + "step": 1760 + }, + { + "epoch": 0.10605788243753371, + "grad_norm": 0.5169436931610107, + "learning_rate": 1.9968788275615002e-05, + "loss": 0.0414, + "step": 1770 + }, + { + "epoch": 0.10665707951345198, + "grad_norm": 1.9156256914138794, + "learning_rate": 1.996797173424102e-05, + "loss": 0.0558, + "step": 1780 + }, + { + "epoch": 0.10725627658937024, + "grad_norm": 0.6522107720375061, + "learning_rate": 1.996714466845269e-05, + "loss": 0.0427, + "step": 1790 + }, + { + "epoch": 0.10785547366528851, + "grad_norm": 0.8480607867240906, + "learning_rate": 1.9966307079220628e-05, + "loss": 0.0425, + "step": 1800 + }, + { + "epoch": 0.10845467074120678, + "grad_norm": 0.6939795017242432, + "learning_rate": 1.9965458967527784e-05, + "loss": 0.0521, + "step": 1810 + }, + { + "epoch": 0.10905386781712505, + "grad_norm": 0.5763843059539795, + "learning_rate": 1.9964600334369466e-05, + "loss": 0.0486, + "step": 1820 + }, + { + "epoch": 0.10965306489304333, + "grad_norm": 1.6420201063156128, + "learning_rate": 1.996373118075333e-05, + "loss": 0.0428, + "step": 1830 + }, + { + "epoch": 0.1102522619689616, + "grad_norm": 0.5305889248847961, + "learning_rate": 1.9962851507699373e-05, + "loss": 0.0371, + "step": 1840 + }, + { + "epoch": 0.11085145904487986, + "grad_norm": 1.3216971158981323, + "learning_rate": 1.9961961316239944e-05, + "loss": 0.0441, + "step": 1850 + }, + { + "epoch": 0.11145065612079813, + "grad_norm": 0.6441370844841003, + "learning_rate": 1.996106060741973e-05, + "loss": 0.0444, + "step": 1860 + }, + { + "epoch": 0.1120498531967164, + "grad_norm": 1.4227683544158936, + "learning_rate": 1.996014938229576e-05, + "loss": 0.053, + "step": 1870 + }, + { + "epoch": 0.11264905027263467, + "grad_norm": 0.667000412940979, + "learning_rate": 1.9959227641937415e-05, + "loss": 0.0405, + "step": 1880 + }, + { + "epoch": 0.11324824734855293, + "grad_norm": 0.6865925192832947, + "learning_rate": 1.99582953874264e-05, + "loss": 0.0532, + "step": 1890 + }, + { + "epoch": 0.11384744442447121, + "grad_norm": 0.8819414377212524, + "learning_rate": 1.9957352619856778e-05, + "loss": 0.0402, + "step": 1900 + }, + { + "epoch": 0.11444664150038948, + "grad_norm": 0.8738685250282288, + "learning_rate": 1.995639934033493e-05, + "loss": 0.0494, + "step": 1910 + }, + { + "epoch": 0.11504583857630775, + "grad_norm": 0.8790421485900879, + "learning_rate": 1.9955435549979585e-05, + "loss": 0.0753, + "step": 1920 + }, + { + "epoch": 0.11564503565222602, + "grad_norm": 0.5451251268386841, + "learning_rate": 1.9954461249921804e-05, + "loss": 0.0385, + "step": 1930 + }, + { + "epoch": 0.11624423272814428, + "grad_norm": 0.46721863746643066, + "learning_rate": 1.9953476441304988e-05, + "loss": 0.0395, + "step": 1940 + }, + { + "epoch": 0.11684342980406255, + "grad_norm": 0.41896265745162964, + "learning_rate": 1.995248112528486e-05, + "loss": 0.0461, + "step": 1950 + }, + { + "epoch": 0.11744262687998083, + "grad_norm": 0.7582527995109558, + "learning_rate": 1.9951475303029478e-05, + "loss": 0.0461, + "step": 1960 + }, + { + "epoch": 0.1180418239558991, + "grad_norm": 0.7154091596603394, + "learning_rate": 1.9950458975719234e-05, + "loss": 0.0464, + "step": 1970 + }, + { + "epoch": 0.11864102103181737, + "grad_norm": 0.788686215877533, + "learning_rate": 1.994943214454684e-05, + "loss": 0.0368, + "step": 1980 + }, + { + "epoch": 0.11924021810773563, + "grad_norm": 0.46885132789611816, + "learning_rate": 1.9948394810717342e-05, + "loss": 0.0472, + "step": 1990 + }, + { + "epoch": 0.1198394151836539, + "grad_norm": 0.5174703598022461, + "learning_rate": 1.9947346975448113e-05, + "loss": 0.0501, + "step": 2000 + }, + { + "epoch": 0.12043861225957217, + "grad_norm": 0.8058022260665894, + "learning_rate": 1.9946288639968838e-05, + "loss": 0.044, + "step": 2010 + }, + { + "epoch": 0.12103780933549044, + "grad_norm": 0.49327152967453003, + "learning_rate": 1.9945219805521535e-05, + "loss": 0.0404, + "step": 2020 + }, + { + "epoch": 0.12163700641140872, + "grad_norm": 1.532515048980713, + "learning_rate": 1.9944140473360548e-05, + "loss": 0.0548, + "step": 2030 + }, + { + "epoch": 0.12223620348732699, + "grad_norm": 1.1101130247116089, + "learning_rate": 1.994305064475253e-05, + "loss": 0.0542, + "step": 2040 + }, + { + "epoch": 0.12283540056324525, + "grad_norm": 0.7396823763847351, + "learning_rate": 1.9941950320976447e-05, + "loss": 0.042, + "step": 2050 + }, + { + "epoch": 0.12343459763916352, + "grad_norm": 0.5801792740821838, + "learning_rate": 1.99408395033236e-05, + "loss": 0.0589, + "step": 2060 + }, + { + "epoch": 0.12403379471508179, + "grad_norm": 1.4451886415481567, + "learning_rate": 1.993971819309759e-05, + "loss": 0.0402, + "step": 2070 + }, + { + "epoch": 0.12463299179100006, + "grad_norm": 0.61793053150177, + "learning_rate": 1.9938586391614344e-05, + "loss": 0.0583, + "step": 2080 + }, + { + "epoch": 0.12523218886691834, + "grad_norm": 0.8073042631149292, + "learning_rate": 1.9937444100202087e-05, + "loss": 0.0492, + "step": 2090 + }, + { + "epoch": 0.1258313859428366, + "grad_norm": 0.9468027949333191, + "learning_rate": 1.9936291320201364e-05, + "loss": 0.0466, + "step": 2100 + }, + { + "epoch": 0.12643058301875487, + "grad_norm": 0.7384629249572754, + "learning_rate": 1.9935128052965026e-05, + "loss": 0.0589, + "step": 2110 + }, + { + "epoch": 0.12702978009467314, + "grad_norm": 0.4612124562263489, + "learning_rate": 1.9933954299858232e-05, + "loss": 0.043, + "step": 2120 + }, + { + "epoch": 0.1276289771705914, + "grad_norm": 0.6821345090866089, + "learning_rate": 1.9932770062258448e-05, + "loss": 0.0373, + "step": 2130 + }, + { + "epoch": 0.12822817424650967, + "grad_norm": 0.6727206110954285, + "learning_rate": 1.9931575341555444e-05, + "loss": 0.0706, + "step": 2140 + }, + { + "epoch": 0.12882737132242794, + "grad_norm": 0.6935863494873047, + "learning_rate": 1.9930370139151283e-05, + "loss": 0.0376, + "step": 2150 + }, + { + "epoch": 0.1294265683983462, + "grad_norm": 0.9824007153511047, + "learning_rate": 1.9929154456460346e-05, + "loss": 0.0418, + "step": 2160 + }, + { + "epoch": 0.13002576547426448, + "grad_norm": 0.9782054424285889, + "learning_rate": 1.9927928294909306e-05, + "loss": 0.0453, + "step": 2170 + }, + { + "epoch": 0.13062496255018274, + "grad_norm": 0.7749345898628235, + "learning_rate": 1.9926691655937126e-05, + "loss": 0.0449, + "step": 2180 + }, + { + "epoch": 0.131224159626101, + "grad_norm": 1.1558616161346436, + "learning_rate": 1.992544454099507e-05, + "loss": 0.051, + "step": 2190 + }, + { + "epoch": 0.1318233567020193, + "grad_norm": 0.33876606822013855, + "learning_rate": 1.9924186951546696e-05, + "loss": 0.0463, + "step": 2200 + }, + { + "epoch": 0.13242255377793757, + "grad_norm": 0.5539175271987915, + "learning_rate": 1.9922918889067863e-05, + "loss": 0.0389, + "step": 2210 + }, + { + "epoch": 0.13302175085385584, + "grad_norm": 0.554639995098114, + "learning_rate": 1.9921640355046706e-05, + "loss": 0.0375, + "step": 2220 + }, + { + "epoch": 0.1336209479297741, + "grad_norm": 0.46284249424934387, + "learning_rate": 1.992035135098366e-05, + "loss": 0.0365, + "step": 2230 + }, + { + "epoch": 0.13422014500569238, + "grad_norm": 0.7209586501121521, + "learning_rate": 1.991905187839144e-05, + "loss": 0.0465, + "step": 2240 + }, + { + "epoch": 0.13481934208161064, + "grad_norm": 1.0352572202682495, + "learning_rate": 1.991774193879505e-05, + "loss": 0.0609, + "step": 2250 + }, + { + "epoch": 0.1354185391575289, + "grad_norm": 0.3893347680568695, + "learning_rate": 1.991642153373178e-05, + "loss": 0.0449, + "step": 2260 + }, + { + "epoch": 0.13601773623344718, + "grad_norm": 0.3959295451641083, + "learning_rate": 1.9915090664751194e-05, + "loss": 0.042, + "step": 2270 + }, + { + "epoch": 0.13661693330936545, + "grad_norm": 0.47758615016937256, + "learning_rate": 1.991374933341515e-05, + "loss": 0.0608, + "step": 2280 + }, + { + "epoch": 0.1372161303852837, + "grad_norm": 0.7173318266868591, + "learning_rate": 1.991239754129776e-05, + "loss": 0.0511, + "step": 2290 + }, + { + "epoch": 0.13781532746120198, + "grad_norm": 0.5889247059822083, + "learning_rate": 1.991103528998544e-05, + "loss": 0.0376, + "step": 2300 + }, + { + "epoch": 0.13841452453712025, + "grad_norm": 0.5986958146095276, + "learning_rate": 1.9909662581076866e-05, + "loss": 0.0348, + "step": 2310 + }, + { + "epoch": 0.13901372161303852, + "grad_norm": 0.9506963491439819, + "learning_rate": 1.990827941618298e-05, + "loss": 0.0513, + "step": 2320 + }, + { + "epoch": 0.1396129186889568, + "grad_norm": 0.8730902671813965, + "learning_rate": 1.9906885796927015e-05, + "loss": 0.0429, + "step": 2330 + }, + { + "epoch": 0.14021211576487508, + "grad_norm": 0.5152983069419861, + "learning_rate": 1.9905481724944453e-05, + "loss": 0.0347, + "step": 2340 + }, + { + "epoch": 0.14081131284079335, + "grad_norm": 0.786233127117157, + "learning_rate": 1.990406720188305e-05, + "loss": 0.0464, + "step": 2350 + }, + { + "epoch": 0.1414105099167116, + "grad_norm": 0.7376151084899902, + "learning_rate": 1.9902642229402834e-05, + "loss": 0.0479, + "step": 2360 + }, + { + "epoch": 0.14200970699262988, + "grad_norm": 0.595055878162384, + "learning_rate": 1.9901206809176085e-05, + "loss": 0.0392, + "step": 2370 + }, + { + "epoch": 0.14260890406854815, + "grad_norm": 0.8207923769950867, + "learning_rate": 1.989976094288735e-05, + "loss": 0.0441, + "step": 2380 + }, + { + "epoch": 0.14320810114446642, + "grad_norm": 0.7003177404403687, + "learning_rate": 1.9898304632233428e-05, + "loss": 0.036, + "step": 2390 + }, + { + "epoch": 0.14380729822038468, + "grad_norm": 0.6637803316116333, + "learning_rate": 1.9896837878923392e-05, + "loss": 0.0507, + "step": 2400 + }, + { + "epoch": 0.14440649529630295, + "grad_norm": 0.5207458138465881, + "learning_rate": 1.9895360684678547e-05, + "loss": 0.0476, + "step": 2410 + }, + { + "epoch": 0.14500569237222122, + "grad_norm": 1.241939663887024, + "learning_rate": 1.989387305123247e-05, + "loss": 0.0466, + "step": 2420 + }, + { + "epoch": 0.14560488944813949, + "grad_norm": 0.7212964296340942, + "learning_rate": 1.9892374980330985e-05, + "loss": 0.0459, + "step": 2430 + }, + { + "epoch": 0.14620408652405775, + "grad_norm": 0.6244897246360779, + "learning_rate": 1.989086647373215e-05, + "loss": 0.0444, + "step": 2440 + }, + { + "epoch": 0.14680328359997602, + "grad_norm": 0.571205198764801, + "learning_rate": 1.988934753320629e-05, + "loss": 0.0611, + "step": 2450 + }, + { + "epoch": 0.14740248067589432, + "grad_norm": 0.8839776515960693, + "learning_rate": 1.9887818160535965e-05, + "loss": 0.0464, + "step": 2460 + }, + { + "epoch": 0.14800167775181258, + "grad_norm": 0.580142080783844, + "learning_rate": 1.988627835751598e-05, + "loss": 0.0434, + "step": 2470 + }, + { + "epoch": 0.14860087482773085, + "grad_norm": 0.6745111346244812, + "learning_rate": 1.9884728125953375e-05, + "loss": 0.0443, + "step": 2480 + }, + { + "epoch": 0.14920007190364912, + "grad_norm": 0.9726730585098267, + "learning_rate": 1.988316746766744e-05, + "loss": 0.0394, + "step": 2490 + }, + { + "epoch": 0.14979926897956738, + "grad_norm": 0.48007458448410034, + "learning_rate": 1.9881596384489683e-05, + "loss": 0.0442, + "step": 2500 + }, + { + "epoch": 0.15039846605548565, + "grad_norm": 0.7205815315246582, + "learning_rate": 1.988001487826387e-05, + "loss": 0.0461, + "step": 2510 + }, + { + "epoch": 0.15099766313140392, + "grad_norm": 0.5800597667694092, + "learning_rate": 1.987842295084598e-05, + "loss": 0.0553, + "step": 2520 + }, + { + "epoch": 0.1515968602073222, + "grad_norm": 0.6497617959976196, + "learning_rate": 1.987682060410423e-05, + "loss": 0.0398, + "step": 2530 + }, + { + "epoch": 0.15219605728324045, + "grad_norm": 0.7487000226974487, + "learning_rate": 1.9875207839919065e-05, + "loss": 0.053, + "step": 2540 + }, + { + "epoch": 0.15279525435915872, + "grad_norm": 0.6686383485794067, + "learning_rate": 1.9873584660183153e-05, + "loss": 0.0494, + "step": 2550 + }, + { + "epoch": 0.153394451435077, + "grad_norm": 0.6101617217063904, + "learning_rate": 1.9871951066801384e-05, + "loss": 0.0397, + "step": 2560 + }, + { + "epoch": 0.15399364851099526, + "grad_norm": 0.49039891362190247, + "learning_rate": 1.987030706169087e-05, + "loss": 0.0406, + "step": 2570 + }, + { + "epoch": 0.15459284558691352, + "grad_norm": 1.076252818107605, + "learning_rate": 1.9868652646780945e-05, + "loss": 0.0472, + "step": 2580 + }, + { + "epoch": 0.1551920426628318, + "grad_norm": 0.7085466980934143, + "learning_rate": 1.986698782401316e-05, + "loss": 0.0481, + "step": 2590 + }, + { + "epoch": 0.1557912397387501, + "grad_norm": 0.6343501210212708, + "learning_rate": 1.9865312595341268e-05, + "loss": 0.0444, + "step": 2600 + }, + { + "epoch": 0.15639043681466835, + "grad_norm": 0.7452435493469238, + "learning_rate": 1.986362696273125e-05, + "loss": 0.0485, + "step": 2610 + }, + { + "epoch": 0.15698963389058662, + "grad_norm": 0.6645557880401611, + "learning_rate": 1.9861930928161288e-05, + "loss": 0.0455, + "step": 2620 + }, + { + "epoch": 0.1575888309665049, + "grad_norm": 0.5987662076950073, + "learning_rate": 1.9860224493621775e-05, + "loss": 0.0384, + "step": 2630 + }, + { + "epoch": 0.15818802804242316, + "grad_norm": 1.078682541847229, + "learning_rate": 1.9858507661115306e-05, + "loss": 0.0416, + "step": 2640 + }, + { + "epoch": 0.15878722511834142, + "grad_norm": 0.8880276083946228, + "learning_rate": 1.985678043265668e-05, + "loss": 0.0427, + "step": 2650 + }, + { + "epoch": 0.1593864221942597, + "grad_norm": 0.8119439482688904, + "learning_rate": 1.985504281027289e-05, + "loss": 0.0516, + "step": 2660 + }, + { + "epoch": 0.15998561927017796, + "grad_norm": 0.5018808245658875, + "learning_rate": 1.9853294796003138e-05, + "loss": 0.035, + "step": 2670 + }, + { + "epoch": 0.16058481634609623, + "grad_norm": 0.623843252658844, + "learning_rate": 1.9851536391898817e-05, + "loss": 0.0468, + "step": 2680 + }, + { + "epoch": 0.1611840134220145, + "grad_norm": 0.48201584815979004, + "learning_rate": 1.9849767600023514e-05, + "loss": 0.0387, + "step": 2690 + }, + { + "epoch": 0.16178321049793276, + "grad_norm": 0.5672967433929443, + "learning_rate": 1.9847988422452998e-05, + "loss": 0.0374, + "step": 2700 + }, + { + "epoch": 0.16238240757385103, + "grad_norm": 0.7304291129112244, + "learning_rate": 1.9846198861275238e-05, + "loss": 0.0458, + "step": 2710 + }, + { + "epoch": 0.1629816046497693, + "grad_norm": 1.1493513584136963, + "learning_rate": 1.984439891859038e-05, + "loss": 0.0495, + "step": 2720 + }, + { + "epoch": 0.1635808017256876, + "grad_norm": 0.8220258951187134, + "learning_rate": 1.9842588596510762e-05, + "loss": 0.0565, + "step": 2730 + }, + { + "epoch": 0.16417999880160586, + "grad_norm": 1.0740118026733398, + "learning_rate": 1.9840767897160894e-05, + "loss": 0.0484, + "step": 2740 + }, + { + "epoch": 0.16477919587752413, + "grad_norm": 0.6214267015457153, + "learning_rate": 1.983893682267747e-05, + "loss": 0.0346, + "step": 2750 + }, + { + "epoch": 0.1653783929534424, + "grad_norm": 0.6255515813827515, + "learning_rate": 1.983709537520935e-05, + "loss": 0.0405, + "step": 2760 + }, + { + "epoch": 0.16597759002936066, + "grad_norm": 1.0625102519989014, + "learning_rate": 1.983524355691759e-05, + "loss": 0.0511, + "step": 2770 + }, + { + "epoch": 0.16657678710527893, + "grad_norm": 0.8623147010803223, + "learning_rate": 1.9833381369975396e-05, + "loss": 0.043, + "step": 2780 + }, + { + "epoch": 0.1671759841811972, + "grad_norm": 0.92961186170578, + "learning_rate": 1.983150881656814e-05, + "loss": 0.0428, + "step": 2790 + }, + { + "epoch": 0.16777518125711546, + "grad_norm": 0.6050530076026917, + "learning_rate": 1.9829625898893375e-05, + "loss": 0.0405, + "step": 2800 + }, + { + "epoch": 0.16837437833303373, + "grad_norm": 0.944632351398468, + "learning_rate": 1.982773261916081e-05, + "loss": 0.0434, + "step": 2810 + }, + { + "epoch": 0.168973575408952, + "grad_norm": 0.4904105067253113, + "learning_rate": 1.9825828979592315e-05, + "loss": 0.0423, + "step": 2820 + }, + { + "epoch": 0.16957277248487027, + "grad_norm": 0.7352654337882996, + "learning_rate": 1.982391498242191e-05, + "loss": 0.0425, + "step": 2830 + }, + { + "epoch": 0.17017196956078853, + "grad_norm": 1.0492011308670044, + "learning_rate": 1.9821990629895787e-05, + "loss": 0.0616, + "step": 2840 + }, + { + "epoch": 0.1707711666367068, + "grad_norm": 0.7823440432548523, + "learning_rate": 1.982005592427227e-05, + "loss": 0.0447, + "step": 2850 + }, + { + "epoch": 0.1713703637126251, + "grad_norm": 0.8018720149993896, + "learning_rate": 1.9818110867821856e-05, + "loss": 0.0371, + "step": 2860 + }, + { + "epoch": 0.17196956078854336, + "grad_norm": 0.49853745102882385, + "learning_rate": 1.9816155462827166e-05, + "loss": 0.036, + "step": 2870 + }, + { + "epoch": 0.17256875786446163, + "grad_norm": 0.8805229663848877, + "learning_rate": 1.9814189711582988e-05, + "loss": 0.0524, + "step": 2880 + }, + { + "epoch": 0.1731679549403799, + "grad_norm": 0.5573164820671082, + "learning_rate": 1.981221361639623e-05, + "loss": 0.0387, + "step": 2890 + }, + { + "epoch": 0.17376715201629817, + "grad_norm": 0.7481330633163452, + "learning_rate": 1.9810227179585956e-05, + "loss": 0.0466, + "step": 2900 + }, + { + "epoch": 0.17436634909221643, + "grad_norm": 0.40816730260849, + "learning_rate": 1.9808230403483355e-05, + "loss": 0.0651, + "step": 2910 + }, + { + "epoch": 0.1749655461681347, + "grad_norm": 0.6791403889656067, + "learning_rate": 1.9806223290431765e-05, + "loss": 0.0393, + "step": 2920 + }, + { + "epoch": 0.17556474324405297, + "grad_norm": 0.7291558384895325, + "learning_rate": 1.980420584278663e-05, + "loss": 0.0521, + "step": 2930 + }, + { + "epoch": 0.17616394031997124, + "grad_norm": 0.6312416791915894, + "learning_rate": 1.9802178062915545e-05, + "loss": 0.0489, + "step": 2940 + }, + { + "epoch": 0.1767631373958895, + "grad_norm": 0.7327824831008911, + "learning_rate": 1.980013995319823e-05, + "loss": 0.0343, + "step": 2950 + }, + { + "epoch": 0.17736233447180777, + "grad_norm": 1.3112396001815796, + "learning_rate": 1.979809151602651e-05, + "loss": 0.0525, + "step": 2960 + }, + { + "epoch": 0.17796153154772604, + "grad_norm": 1.2425460815429688, + "learning_rate": 1.9796032753804343e-05, + "loss": 0.0419, + "step": 2970 + }, + { + "epoch": 0.1785607286236443, + "grad_norm": 0.6839079856872559, + "learning_rate": 1.9793963668947803e-05, + "loss": 0.0491, + "step": 2980 + }, + { + "epoch": 0.1791599256995626, + "grad_norm": 0.7781338691711426, + "learning_rate": 1.9791884263885074e-05, + "loss": 0.0434, + "step": 2990 + }, + { + "epoch": 0.17975912277548087, + "grad_norm": 0.5329035520553589, + "learning_rate": 1.9789794541056456e-05, + "loss": 0.0468, + "step": 3000 + }, + { + "epoch": 0.18035831985139913, + "grad_norm": 0.7196246981620789, + "learning_rate": 1.978769450291435e-05, + "loss": 0.044, + "step": 3010 + }, + { + "epoch": 0.1809575169273174, + "grad_norm": 0.7625473737716675, + "learning_rate": 1.9785584151923272e-05, + "loss": 0.0441, + "step": 3020 + }, + { + "epoch": 0.18155671400323567, + "grad_norm": 0.5458085536956787, + "learning_rate": 1.978346349055984e-05, + "loss": 0.039, + "step": 3030 + }, + { + "epoch": 0.18215591107915394, + "grad_norm": 0.7765107154846191, + "learning_rate": 1.978133252131276e-05, + "loss": 0.0467, + "step": 3040 + }, + { + "epoch": 0.1827551081550722, + "grad_norm": 0.7010345458984375, + "learning_rate": 1.9779191246682853e-05, + "loss": 0.04, + "step": 3050 + }, + { + "epoch": 0.18335430523099047, + "grad_norm": 0.626748263835907, + "learning_rate": 1.9777039669183012e-05, + "loss": 0.0373, + "step": 3060 + }, + { + "epoch": 0.18395350230690874, + "grad_norm": 0.5149411559104919, + "learning_rate": 1.9774877791338244e-05, + "loss": 0.0461, + "step": 3070 + }, + { + "epoch": 0.184552699382827, + "grad_norm": 0.9740221500396729, + "learning_rate": 1.9772705615685625e-05, + "loss": 0.037, + "step": 3080 + }, + { + "epoch": 0.18515189645874527, + "grad_norm": 0.504397988319397, + "learning_rate": 1.9770523144774325e-05, + "loss": 0.054, + "step": 3090 + }, + { + "epoch": 0.18575109353466354, + "grad_norm": 0.5483772158622742, + "learning_rate": 1.9768330381165603e-05, + "loss": 0.0365, + "step": 3100 + }, + { + "epoch": 0.1863502906105818, + "grad_norm": 0.29313552379608154, + "learning_rate": 1.976612732743278e-05, + "loss": 0.038, + "step": 3110 + }, + { + "epoch": 0.18694948768650008, + "grad_norm": 0.8453809022903442, + "learning_rate": 1.9763913986161268e-05, + "loss": 0.0413, + "step": 3120 + }, + { + "epoch": 0.18754868476241837, + "grad_norm": 0.5152369141578674, + "learning_rate": 1.9761690359948545e-05, + "loss": 0.0383, + "step": 3130 + }, + { + "epoch": 0.18814788183833664, + "grad_norm": 0.9969985485076904, + "learning_rate": 1.975945645140416e-05, + "loss": 0.0465, + "step": 3140 + }, + { + "epoch": 0.1887470789142549, + "grad_norm": 0.9506912231445312, + "learning_rate": 1.9757212263149725e-05, + "loss": 0.0377, + "step": 3150 + }, + { + "epoch": 0.18934627599017317, + "grad_norm": 0.9154256582260132, + "learning_rate": 1.975495779781893e-05, + "loss": 0.0428, + "step": 3160 + }, + { + "epoch": 0.18994547306609144, + "grad_norm": 1.2283018827438354, + "learning_rate": 1.9752693058057504e-05, + "loss": 0.0403, + "step": 3170 + }, + { + "epoch": 0.1905446701420097, + "grad_norm": 0.6880149841308594, + "learning_rate": 1.9750418046523253e-05, + "loss": 0.0395, + "step": 3180 + }, + { + "epoch": 0.19114386721792798, + "grad_norm": 0.4900283217430115, + "learning_rate": 1.9748132765886024e-05, + "loss": 0.0368, + "step": 3190 + }, + { + "epoch": 0.19174306429384624, + "grad_norm": 0.7604786157608032, + "learning_rate": 1.9745837218827727e-05, + "loss": 0.0447, + "step": 3200 + }, + { + "epoch": 0.1923422613697645, + "grad_norm": 0.559420108795166, + "learning_rate": 1.974353140804231e-05, + "loss": 0.0456, + "step": 3210 + }, + { + "epoch": 0.19294145844568278, + "grad_norm": 0.5867525339126587, + "learning_rate": 1.9741215336235774e-05, + "loss": 0.0484, + "step": 3220 + }, + { + "epoch": 0.19354065552160105, + "grad_norm": 0.4810929596424103, + "learning_rate": 1.9738889006126157e-05, + "loss": 0.0406, + "step": 3230 + }, + { + "epoch": 0.1941398525975193, + "grad_norm": 0.8294567465782166, + "learning_rate": 1.9736552420443535e-05, + "loss": 0.0405, + "step": 3240 + }, + { + "epoch": 0.19473904967343758, + "grad_norm": 0.8964418172836304, + "learning_rate": 1.9734205581930027e-05, + "loss": 0.0551, + "step": 3250 + }, + { + "epoch": 0.19533824674935588, + "grad_norm": 0.5311513543128967, + "learning_rate": 1.9731848493339765e-05, + "loss": 0.048, + "step": 3260 + }, + { + "epoch": 0.19593744382527414, + "grad_norm": 0.806564450263977, + "learning_rate": 1.972948115743894e-05, + "loss": 0.0422, + "step": 3270 + }, + { + "epoch": 0.1965366409011924, + "grad_norm": 0.6752825975418091, + "learning_rate": 1.9727103577005746e-05, + "loss": 0.0436, + "step": 3280 + }, + { + "epoch": 0.19713583797711068, + "grad_norm": 0.5873673558235168, + "learning_rate": 1.9724715754830404e-05, + "loss": 0.046, + "step": 3290 + }, + { + "epoch": 0.19773503505302895, + "grad_norm": 0.44951826333999634, + "learning_rate": 1.972231769371516e-05, + "loss": 0.0581, + "step": 3300 + }, + { + "epoch": 0.1983342321289472, + "grad_norm": 0.6930672526359558, + "learning_rate": 1.9719909396474268e-05, + "loss": 0.0482, + "step": 3310 + }, + { + "epoch": 0.19893342920486548, + "grad_norm": 0.5176821351051331, + "learning_rate": 1.9717490865934004e-05, + "loss": 0.0469, + "step": 3320 + }, + { + "epoch": 0.19953262628078375, + "grad_norm": 0.49050986766815186, + "learning_rate": 1.9715062104932644e-05, + "loss": 0.0505, + "step": 3330 + }, + { + "epoch": 0.20013182335670202, + "grad_norm": 0.7312544584274292, + "learning_rate": 1.9712623116320478e-05, + "loss": 0.0397, + "step": 3340 + }, + { + "epoch": 0.20073102043262028, + "grad_norm": 0.7582018375396729, + "learning_rate": 1.971017390295979e-05, + "loss": 0.0472, + "step": 3350 + }, + { + "epoch": 0.20133021750853855, + "grad_norm": 0.5867499113082886, + "learning_rate": 1.9707714467724874e-05, + "loss": 0.0402, + "step": 3360 + }, + { + "epoch": 0.20192941458445682, + "grad_norm": 0.5435264706611633, + "learning_rate": 1.970524481350201e-05, + "loss": 0.0357, + "step": 3370 + }, + { + "epoch": 0.20252861166037509, + "grad_norm": 0.7370457053184509, + "learning_rate": 1.9702764943189478e-05, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 0.20312780873629338, + "grad_norm": 0.774713933467865, + "learning_rate": 1.970027485969754e-05, + "loss": 0.0419, + "step": 3390 + }, + { + "epoch": 0.20372700581221165, + "grad_norm": 1.3614526987075806, + "learning_rate": 1.969777456594845e-05, + "loss": 0.0443, + "step": 3400 + }, + { + "epoch": 0.20432620288812992, + "grad_norm": 0.6087996959686279, + "learning_rate": 1.9695264064876445e-05, + "loss": 0.0362, + "step": 3410 + }, + { + "epoch": 0.20492539996404818, + "grad_norm": 0.6685174703598022, + "learning_rate": 1.9692743359427734e-05, + "loss": 0.0437, + "step": 3420 + }, + { + "epoch": 0.20552459703996645, + "grad_norm": 0.9508783221244812, + "learning_rate": 1.9690212452560504e-05, + "loss": 0.0403, + "step": 3430 + }, + { + "epoch": 0.20612379411588472, + "grad_norm": 0.5553990006446838, + "learning_rate": 1.9687671347244916e-05, + "loss": 0.0454, + "step": 3440 + }, + { + "epoch": 0.20672299119180298, + "grad_norm": 0.5054144263267517, + "learning_rate": 1.9685120046463105e-05, + "loss": 0.0651, + "step": 3450 + }, + { + "epoch": 0.20732218826772125, + "grad_norm": 0.42293739318847656, + "learning_rate": 1.9682558553209156e-05, + "loss": 0.0431, + "step": 3460 + }, + { + "epoch": 0.20792138534363952, + "grad_norm": 0.7212286591529846, + "learning_rate": 1.967998687048913e-05, + "loss": 0.0415, + "step": 3470 + }, + { + "epoch": 0.2085205824195578, + "grad_norm": 0.473127543926239, + "learning_rate": 1.9677405001321032e-05, + "loss": 0.046, + "step": 3480 + }, + { + "epoch": 0.20911977949547605, + "grad_norm": 0.6872493028640747, + "learning_rate": 1.9674812948734844e-05, + "loss": 0.031, + "step": 3490 + }, + { + "epoch": 0.20971897657139432, + "grad_norm": 0.5251455903053284, + "learning_rate": 1.9672210715772465e-05, + "loss": 0.0391, + "step": 3500 + }, + { + "epoch": 0.2103181736473126, + "grad_norm": 0.5380337834358215, + "learning_rate": 1.9669598305487772e-05, + "loss": 0.0409, + "step": 3510 + }, + { + "epoch": 0.21091737072323086, + "grad_norm": 0.7052116394042969, + "learning_rate": 1.966697572094658e-05, + "loss": 0.0416, + "step": 3520 + }, + { + "epoch": 0.21151656779914915, + "grad_norm": 0.8229309916496277, + "learning_rate": 1.9664342965226623e-05, + "loss": 0.0372, + "step": 3530 + }, + { + "epoch": 0.21211576487506742, + "grad_norm": 0.9506240487098694, + "learning_rate": 1.9661700041417592e-05, + "loss": 0.0419, + "step": 3540 + }, + { + "epoch": 0.2127149619509857, + "grad_norm": 0.6417449116706848, + "learning_rate": 1.965904695262111e-05, + "loss": 0.0431, + "step": 3550 + }, + { + "epoch": 0.21331415902690395, + "grad_norm": 0.6112877130508423, + "learning_rate": 1.9656383701950722e-05, + "loss": 0.0498, + "step": 3560 + }, + { + "epoch": 0.21391335610282222, + "grad_norm": 1.0621747970581055, + "learning_rate": 1.9653710292531897e-05, + "loss": 0.0478, + "step": 3570 + }, + { + "epoch": 0.2145125531787405, + "grad_norm": 0.7538444995880127, + "learning_rate": 1.9651026727502036e-05, + "loss": 0.0402, + "step": 3580 + }, + { + "epoch": 0.21511175025465876, + "grad_norm": 0.5625021457672119, + "learning_rate": 1.964833301001045e-05, + "loss": 0.048, + "step": 3590 + }, + { + "epoch": 0.21571094733057702, + "grad_norm": 0.47914358973503113, + "learning_rate": 1.9645629143218367e-05, + "loss": 0.0371, + "step": 3600 + }, + { + "epoch": 0.2163101444064953, + "grad_norm": 0.6854084134101868, + "learning_rate": 1.9642915130298918e-05, + "loss": 0.0478, + "step": 3610 + }, + { + "epoch": 0.21690934148241356, + "grad_norm": 0.9252145886421204, + "learning_rate": 1.9640190974437156e-05, + "loss": 0.0368, + "step": 3620 + }, + { + "epoch": 0.21750853855833183, + "grad_norm": 0.8439743518829346, + "learning_rate": 1.963745667883003e-05, + "loss": 0.0417, + "step": 3630 + }, + { + "epoch": 0.2181077356342501, + "grad_norm": 1.0050065517425537, + "learning_rate": 1.9634712246686386e-05, + "loss": 0.0444, + "step": 3640 + }, + { + "epoch": 0.21870693271016836, + "grad_norm": 0.7451267242431641, + "learning_rate": 1.9631957681226973e-05, + "loss": 0.0444, + "step": 3650 + }, + { + "epoch": 0.21930612978608666, + "grad_norm": 0.8371824622154236, + "learning_rate": 1.9629192985684414e-05, + "loss": 0.0413, + "step": 3660 + }, + { + "epoch": 0.21990532686200492, + "grad_norm": 1.0461528301239014, + "learning_rate": 1.9626418163303244e-05, + "loss": 0.0343, + "step": 3670 + }, + { + "epoch": 0.2205045239379232, + "grad_norm": 0.39973369240760803, + "learning_rate": 1.962363321733987e-05, + "loss": 0.0411, + "step": 3680 + }, + { + "epoch": 0.22110372101384146, + "grad_norm": 0.4291725754737854, + "learning_rate": 1.962083815106258e-05, + "loss": 0.035, + "step": 3690 + }, + { + "epoch": 0.22170291808975973, + "grad_norm": 0.7072318196296692, + "learning_rate": 1.9618032967751543e-05, + "loss": 0.0412, + "step": 3700 + }, + { + "epoch": 0.222302115165678, + "grad_norm": 0.5897591710090637, + "learning_rate": 1.9615217670698792e-05, + "loss": 0.0422, + "step": 3710 + }, + { + "epoch": 0.22290131224159626, + "grad_norm": 0.724743664264679, + "learning_rate": 1.9612392263208238e-05, + "loss": 0.0412, + "step": 3720 + }, + { + "epoch": 0.22350050931751453, + "grad_norm": 0.6499989628791809, + "learning_rate": 1.9609556748595653e-05, + "loss": 0.0456, + "step": 3730 + }, + { + "epoch": 0.2240997063934328, + "grad_norm": 0.7375554442405701, + "learning_rate": 1.9606711130188673e-05, + "loss": 0.0481, + "step": 3740 + }, + { + "epoch": 0.22469890346935106, + "grad_norm": 0.5231707096099854, + "learning_rate": 1.960385541132679e-05, + "loss": 0.0444, + "step": 3750 + }, + { + "epoch": 0.22529810054526933, + "grad_norm": 0.6235650777816772, + "learning_rate": 1.9600989595361346e-05, + "loss": 0.0352, + "step": 3760 + }, + { + "epoch": 0.2258972976211876, + "grad_norm": 0.43499720096588135, + "learning_rate": 1.9598113685655543e-05, + "loss": 0.0389, + "step": 3770 + }, + { + "epoch": 0.22649649469710587, + "grad_norm": 0.797736406326294, + "learning_rate": 1.9595227685584414e-05, + "loss": 0.0444, + "step": 3780 + }, + { + "epoch": 0.22709569177302416, + "grad_norm": 1.0550916194915771, + "learning_rate": 1.959233159853484e-05, + "loss": 0.0504, + "step": 3790 + }, + { + "epoch": 0.22769488884894243, + "grad_norm": 0.6214169263839722, + "learning_rate": 1.9589425427905545e-05, + "loss": 0.0406, + "step": 3800 + }, + { + "epoch": 0.2282940859248607, + "grad_norm": 0.698083221912384, + "learning_rate": 1.958650917710708e-05, + "loss": 0.0593, + "step": 3810 + }, + { + "epoch": 0.22889328300077896, + "grad_norm": 0.6379665732383728, + "learning_rate": 1.958358284956183e-05, + "loss": 0.0493, + "step": 3820 + }, + { + "epoch": 0.22949248007669723, + "grad_norm": 0.5507146120071411, + "learning_rate": 1.9580646448704e-05, + "loss": 0.0433, + "step": 3830 + }, + { + "epoch": 0.2300916771526155, + "grad_norm": 0.5956857204437256, + "learning_rate": 1.9577699977979624e-05, + "loss": 0.049, + "step": 3840 + }, + { + "epoch": 0.23069087422853377, + "grad_norm": 0.44772031903266907, + "learning_rate": 1.9574743440846543e-05, + "loss": 0.0479, + "step": 3850 + }, + { + "epoch": 0.23129007130445203, + "grad_norm": 0.9360495209693909, + "learning_rate": 1.9571776840774426e-05, + "loss": 0.0434, + "step": 3860 + }, + { + "epoch": 0.2318892683803703, + "grad_norm": 0.5642439126968384, + "learning_rate": 1.9568800181244737e-05, + "loss": 0.0396, + "step": 3870 + }, + { + "epoch": 0.23248846545628857, + "grad_norm": 0.4046037495136261, + "learning_rate": 1.9565813465750757e-05, + "loss": 0.0408, + "step": 3880 + }, + { + "epoch": 0.23308766253220684, + "grad_norm": 0.5948778986930847, + "learning_rate": 1.9562816697797555e-05, + "loss": 0.0349, + "step": 3890 + }, + { + "epoch": 0.2336868596081251, + "grad_norm": 0.8199960589408875, + "learning_rate": 1.9559809880902017e-05, + "loss": 0.035, + "step": 3900 + }, + { + "epoch": 0.23428605668404337, + "grad_norm": 0.4827987253665924, + "learning_rate": 1.95567930185928e-05, + "loss": 0.0422, + "step": 3910 + }, + { + "epoch": 0.23488525375996167, + "grad_norm": 0.8324541449546814, + "learning_rate": 1.9553766114410362e-05, + "loss": 0.0396, + "step": 3920 + }, + { + "epoch": 0.23548445083587993, + "grad_norm": 0.4008340537548065, + "learning_rate": 1.9550729171906944e-05, + "loss": 0.0399, + "step": 3930 + }, + { + "epoch": 0.2360836479117982, + "grad_norm": 0.6216022372245789, + "learning_rate": 1.9547682194646572e-05, + "loss": 0.0456, + "step": 3940 + }, + { + "epoch": 0.23668284498771647, + "grad_norm": 0.37505266070365906, + "learning_rate": 1.9544625186205043e-05, + "loss": 0.0385, + "step": 3950 + }, + { + "epoch": 0.23728204206363473, + "grad_norm": 0.49176743626594543, + "learning_rate": 1.954155815016992e-05, + "loss": 0.0394, + "step": 3960 + }, + { + "epoch": 0.237881239139553, + "grad_norm": 0.5399725437164307, + "learning_rate": 1.9538481090140542e-05, + "loss": 0.0438, + "step": 3970 + }, + { + "epoch": 0.23848043621547127, + "grad_norm": 0.8310949802398682, + "learning_rate": 1.9535394009728014e-05, + "loss": 0.0412, + "step": 3980 + }, + { + "epoch": 0.23907963329138954, + "grad_norm": 1.1955338716506958, + "learning_rate": 1.9532296912555196e-05, + "loss": 0.0459, + "step": 3990 + }, + { + "epoch": 0.2396788303673078, + "grad_norm": 1.0068060159683228, + "learning_rate": 1.95291898022567e-05, + "loss": 0.0491, + "step": 4000 + }, + { + "epoch": 0.24027802744322607, + "grad_norm": 0.5460902452468872, + "learning_rate": 1.95260726824789e-05, + "loss": 0.0454, + "step": 4010 + }, + { + "epoch": 0.24087722451914434, + "grad_norm": 0.7850955128669739, + "learning_rate": 1.9522945556879906e-05, + "loss": 0.038, + "step": 4020 + }, + { + "epoch": 0.2414764215950626, + "grad_norm": 0.36727651953697205, + "learning_rate": 1.9519808429129572e-05, + "loss": 0.042, + "step": 4030 + }, + { + "epoch": 0.24207561867098087, + "grad_norm": 0.5334084630012512, + "learning_rate": 1.9516661302909498e-05, + "loss": 0.0472, + "step": 4040 + }, + { + "epoch": 0.24267481574689914, + "grad_norm": 0.7271261215209961, + "learning_rate": 1.9513504181913004e-05, + "loss": 0.0382, + "step": 4050 + }, + { + "epoch": 0.24327401282281744, + "grad_norm": 0.5323888063430786, + "learning_rate": 1.9510337069845154e-05, + "loss": 0.0436, + "step": 4060 + }, + { + "epoch": 0.2438732098987357, + "grad_norm": 0.45585381984710693, + "learning_rate": 1.9507159970422727e-05, + "loss": 0.0374, + "step": 4070 + }, + { + "epoch": 0.24447240697465397, + "grad_norm": 0.7871994376182556, + "learning_rate": 1.9503972887374225e-05, + "loss": 0.0523, + "step": 4080 + }, + { + "epoch": 0.24507160405057224, + "grad_norm": 0.5605924129486084, + "learning_rate": 1.950077582443987e-05, + "loss": 0.0394, + "step": 4090 + }, + { + "epoch": 0.2456708011264905, + "grad_norm": 0.6938880085945129, + "learning_rate": 1.9497568785371595e-05, + "loss": 0.0394, + "step": 4100 + }, + { + "epoch": 0.24626999820240877, + "grad_norm": 0.5804795026779175, + "learning_rate": 1.9494351773933035e-05, + "loss": 0.0437, + "step": 4110 + }, + { + "epoch": 0.24686919527832704, + "grad_norm": 1.0168874263763428, + "learning_rate": 1.949112479389954e-05, + "loss": 0.0419, + "step": 4120 + }, + { + "epoch": 0.2474683923542453, + "grad_norm": 0.6860261559486389, + "learning_rate": 1.9487887849058137e-05, + "loss": 0.0381, + "step": 4130 + }, + { + "epoch": 0.24806758943016358, + "grad_norm": 0.7029629349708557, + "learning_rate": 1.9484640943207574e-05, + "loss": 0.0405, + "step": 4140 + }, + { + "epoch": 0.24866678650608184, + "grad_norm": 0.5081820487976074, + "learning_rate": 1.9481384080158267e-05, + "loss": 0.0359, + "step": 4150 + }, + { + "epoch": 0.2492659835820001, + "grad_norm": 0.4721413254737854, + "learning_rate": 1.9478117263732333e-05, + "loss": 0.0445, + "step": 4160 + }, + { + "epoch": 0.24986518065791838, + "grad_norm": 0.36132606863975525, + "learning_rate": 1.9474840497763558e-05, + "loss": 0.0443, + "step": 4170 + }, + { + "epoch": 0.2504643777338367, + "grad_norm": 0.6331628561019897, + "learning_rate": 1.9471553786097414e-05, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 0.25106357480975494, + "grad_norm": 0.5754039287567139, + "learning_rate": 1.9468257132591035e-05, + "loss": 0.0364, + "step": 4190 + }, + { + "epoch": 0.2516627718856732, + "grad_norm": 1.5680726766586304, + "learning_rate": 1.946495054111323e-05, + "loss": 0.0568, + "step": 4200 + }, + { + "epoch": 0.2522619689615915, + "grad_norm": 0.49352893233299255, + "learning_rate": 1.9461634015544467e-05, + "loss": 0.0352, + "step": 4210 + }, + { + "epoch": 0.25286116603750974, + "grad_norm": 0.6292720437049866, + "learning_rate": 1.945830755977688e-05, + "loss": 0.056, + "step": 4220 + }, + { + "epoch": 0.253460363113428, + "grad_norm": 0.7185224294662476, + "learning_rate": 1.945497117771424e-05, + "loss": 0.0463, + "step": 4230 + }, + { + "epoch": 0.2540595601893463, + "grad_norm": 0.5580431222915649, + "learning_rate": 1.9451624873271982e-05, + "loss": 0.0395, + "step": 4240 + }, + { + "epoch": 0.25465875726526455, + "grad_norm": 0.7590157985687256, + "learning_rate": 1.9448268650377187e-05, + "loss": 0.0367, + "step": 4250 + }, + { + "epoch": 0.2552579543411828, + "grad_norm": 0.6500505208969116, + "learning_rate": 1.944490251296856e-05, + "loss": 0.0373, + "step": 4260 + }, + { + "epoch": 0.2558571514171011, + "grad_norm": 0.408975213766098, + "learning_rate": 1.944152646499645e-05, + "loss": 0.0458, + "step": 4270 + }, + { + "epoch": 0.25645634849301935, + "grad_norm": 0.5616204142570496, + "learning_rate": 1.9438140510422846e-05, + "loss": 0.0397, + "step": 4280 + }, + { + "epoch": 0.2570555455689376, + "grad_norm": 0.6361889243125916, + "learning_rate": 1.943474465322135e-05, + "loss": 0.0371, + "step": 4290 + }, + { + "epoch": 0.2576547426448559, + "grad_norm": 0.8486977219581604, + "learning_rate": 1.9431338897377186e-05, + "loss": 0.0428, + "step": 4300 + }, + { + "epoch": 0.25825393972077415, + "grad_norm": 0.7492835521697998, + "learning_rate": 1.9427923246887208e-05, + "loss": 0.0444, + "step": 4310 + }, + { + "epoch": 0.2588531367966924, + "grad_norm": 0.7901867032051086, + "learning_rate": 1.9424497705759858e-05, + "loss": 0.0413, + "step": 4320 + }, + { + "epoch": 0.2594523338726107, + "grad_norm": 0.6845218539237976, + "learning_rate": 1.942106227801521e-05, + "loss": 0.041, + "step": 4330 + }, + { + "epoch": 0.26005153094852895, + "grad_norm": 0.9644033908843994, + "learning_rate": 1.941761696768493e-05, + "loss": 0.0482, + "step": 4340 + }, + { + "epoch": 0.2606507280244472, + "grad_norm": 0.45466694235801697, + "learning_rate": 1.941416177881227e-05, + "loss": 0.0507, + "step": 4350 + }, + { + "epoch": 0.2612499251003655, + "grad_norm": 0.37155815958976746, + "learning_rate": 1.94106967154521e-05, + "loss": 0.0563, + "step": 4360 + }, + { + "epoch": 0.26184912217628376, + "grad_norm": 0.4936427175998688, + "learning_rate": 1.940722178167086e-05, + "loss": 0.0466, + "step": 4370 + }, + { + "epoch": 0.262448319252202, + "grad_norm": 0.6540364027023315, + "learning_rate": 1.940373698154658e-05, + "loss": 0.0426, + "step": 4380 + }, + { + "epoch": 0.26304751632812035, + "grad_norm": 0.38369905948638916, + "learning_rate": 1.940024231916886e-05, + "loss": 0.0423, + "step": 4390 + }, + { + "epoch": 0.2636467134040386, + "grad_norm": 0.5450782179832458, + "learning_rate": 1.939673779863889e-05, + "loss": 0.0499, + "step": 4400 + }, + { + "epoch": 0.2642459104799569, + "grad_norm": 0.24151510000228882, + "learning_rate": 1.9393223424069416e-05, + "loss": 0.0431, + "step": 4410 + }, + { + "epoch": 0.26484510755587515, + "grad_norm": 0.8069043159484863, + "learning_rate": 1.938969919958475e-05, + "loss": 0.0447, + "step": 4420 + }, + { + "epoch": 0.2654443046317934, + "grad_norm": 0.5423257946968079, + "learning_rate": 1.9386165129320767e-05, + "loss": 0.0496, + "step": 4430 + }, + { + "epoch": 0.2660435017077117, + "grad_norm": 0.4058588445186615, + "learning_rate": 1.93826212174249e-05, + "loss": 0.0402, + "step": 4440 + }, + { + "epoch": 0.26664269878362995, + "grad_norm": 0.6126188635826111, + "learning_rate": 1.9379067468056124e-05, + "loss": 0.0458, + "step": 4450 + }, + { + "epoch": 0.2672418958595482, + "grad_norm": 0.7490487694740295, + "learning_rate": 1.9375503885384962e-05, + "loss": 0.0493, + "step": 4460 + }, + { + "epoch": 0.2678410929354665, + "grad_norm": 0.7295238971710205, + "learning_rate": 1.9371930473593474e-05, + "loss": 0.0462, + "step": 4470 + }, + { + "epoch": 0.26844029001138475, + "grad_norm": 0.7178632616996765, + "learning_rate": 1.936834723687526e-05, + "loss": 0.0443, + "step": 4480 + }, + { + "epoch": 0.269039487087303, + "grad_norm": 0.7040836215019226, + "learning_rate": 1.9364754179435445e-05, + "loss": 0.0414, + "step": 4490 + }, + { + "epoch": 0.2696386841632213, + "grad_norm": 0.6338651776313782, + "learning_rate": 1.936115130549069e-05, + "loss": 0.0354, + "step": 4500 + }, + { + "epoch": 0.27023788123913955, + "grad_norm": 1.3360642194747925, + "learning_rate": 1.935753861926916e-05, + "loss": 0.0503, + "step": 4510 + }, + { + "epoch": 0.2708370783150578, + "grad_norm": 0.46927154064178467, + "learning_rate": 1.9353916125010545e-05, + "loss": 0.0379, + "step": 4520 + }, + { + "epoch": 0.2714362753909761, + "grad_norm": 0.7340303659439087, + "learning_rate": 1.9350283826966046e-05, + "loss": 0.0381, + "step": 4530 + }, + { + "epoch": 0.27203547246689436, + "grad_norm": 0.5492366552352905, + "learning_rate": 1.9346641729398364e-05, + "loss": 0.0328, + "step": 4540 + }, + { + "epoch": 0.2726346695428126, + "grad_norm": 0.7509336471557617, + "learning_rate": 1.934298983658171e-05, + "loss": 0.0368, + "step": 4550 + }, + { + "epoch": 0.2732338666187309, + "grad_norm": 0.4471103847026825, + "learning_rate": 1.933932815280178e-05, + "loss": 0.0405, + "step": 4560 + }, + { + "epoch": 0.27383306369464916, + "grad_norm": 0.6582043170928955, + "learning_rate": 1.9335656682355764e-05, + "loss": 0.0422, + "step": 4570 + }, + { + "epoch": 0.2744322607705674, + "grad_norm": 0.6933317184448242, + "learning_rate": 1.933197542955233e-05, + "loss": 0.0347, + "step": 4580 + }, + { + "epoch": 0.2750314578464857, + "grad_norm": 0.450021892786026, + "learning_rate": 1.9328284398711645e-05, + "loss": 0.0362, + "step": 4590 + }, + { + "epoch": 0.27563065492240396, + "grad_norm": 0.5376274585723877, + "learning_rate": 1.932458359416533e-05, + "loss": 0.0619, + "step": 4600 + }, + { + "epoch": 0.27622985199832223, + "grad_norm": 0.722744882106781, + "learning_rate": 1.9320873020256494e-05, + "loss": 0.0446, + "step": 4610 + }, + { + "epoch": 0.2768290490742405, + "grad_norm": 0.6075776219367981, + "learning_rate": 1.9317152681339693e-05, + "loss": 0.047, + "step": 4620 + }, + { + "epoch": 0.27742824615015876, + "grad_norm": 0.6950559020042419, + "learning_rate": 1.9313422581780955e-05, + "loss": 0.0366, + "step": 4630 + }, + { + "epoch": 0.27802744322607703, + "grad_norm": 0.5763269066810608, + "learning_rate": 1.9309682725957766e-05, + "loss": 0.0416, + "step": 4640 + }, + { + "epoch": 0.2786266403019953, + "grad_norm": 0.5462995767593384, + "learning_rate": 1.9305933118259048e-05, + "loss": 0.042, + "step": 4650 + }, + { + "epoch": 0.2792258373779136, + "grad_norm": 0.6304270029067993, + "learning_rate": 1.9302173763085175e-05, + "loss": 0.0388, + "step": 4660 + }, + { + "epoch": 0.2798250344538319, + "grad_norm": 0.6828057765960693, + "learning_rate": 1.929840466484796e-05, + "loss": 0.0324, + "step": 4670 + }, + { + "epoch": 0.28042423152975016, + "grad_norm": 0.37152284383773804, + "learning_rate": 1.9294625827970655e-05, + "loss": 0.0451, + "step": 4680 + }, + { + "epoch": 0.2810234286056684, + "grad_norm": 0.4172256588935852, + "learning_rate": 1.9290837256887928e-05, + "loss": 0.0357, + "step": 4690 + }, + { + "epoch": 0.2816226256815867, + "grad_norm": 0.5640333294868469, + "learning_rate": 1.928703895604588e-05, + "loss": 0.0522, + "step": 4700 + }, + { + "epoch": 0.28222182275750496, + "grad_norm": 0.8016167879104614, + "learning_rate": 1.9283230929902033e-05, + "loss": 0.0381, + "step": 4710 + }, + { + "epoch": 0.2828210198334232, + "grad_norm": 0.591262698173523, + "learning_rate": 1.9279413182925316e-05, + "loss": 0.0382, + "step": 4720 + }, + { + "epoch": 0.2834202169093415, + "grad_norm": 0.5212893486022949, + "learning_rate": 1.9275585719596062e-05, + "loss": 0.0478, + "step": 4730 + }, + { + "epoch": 0.28401941398525976, + "grad_norm": 0.7837402820587158, + "learning_rate": 1.927174854440602e-05, + "loss": 0.0443, + "step": 4740 + }, + { + "epoch": 0.28461861106117803, + "grad_norm": 0.9257993698120117, + "learning_rate": 1.926790166185833e-05, + "loss": 0.0468, + "step": 4750 + }, + { + "epoch": 0.2852178081370963, + "grad_norm": 0.5952717065811157, + "learning_rate": 1.926404507646751e-05, + "loss": 0.033, + "step": 4760 + }, + { + "epoch": 0.28581700521301456, + "grad_norm": 0.9675727486610413, + "learning_rate": 1.9260178792759493e-05, + "loss": 0.0451, + "step": 4770 + }, + { + "epoch": 0.28641620228893283, + "grad_norm": 0.5518060326576233, + "learning_rate": 1.925630281527157e-05, + "loss": 0.039, + "step": 4780 + }, + { + "epoch": 0.2870153993648511, + "grad_norm": 0.9742224216461182, + "learning_rate": 1.9252417148552417e-05, + "loss": 0.0398, + "step": 4790 + }, + { + "epoch": 0.28761459644076937, + "grad_norm": 0.6197847723960876, + "learning_rate": 1.9248521797162085e-05, + "loss": 0.0466, + "step": 4800 + }, + { + "epoch": 0.28821379351668763, + "grad_norm": 0.47963154315948486, + "learning_rate": 1.924461676567198e-05, + "loss": 0.0449, + "step": 4810 + }, + { + "epoch": 0.2888129905926059, + "grad_norm": 0.41337478160858154, + "learning_rate": 1.9240702058664874e-05, + "loss": 0.0441, + "step": 4820 + }, + { + "epoch": 0.28941218766852417, + "grad_norm": 0.7238340973854065, + "learning_rate": 1.92367776807349e-05, + "loss": 0.0438, + "step": 4830 + }, + { + "epoch": 0.29001138474444244, + "grad_norm": 0.9248948097229004, + "learning_rate": 1.9232843636487527e-05, + "loss": 0.059, + "step": 4840 + }, + { + "epoch": 0.2906105818203607, + "grad_norm": 0.6670559048652649, + "learning_rate": 1.9228899930539583e-05, + "loss": 0.0388, + "step": 4850 + }, + { + "epoch": 0.29120977889627897, + "grad_norm": 0.956350564956665, + "learning_rate": 1.922494656751922e-05, + "loss": 0.0402, + "step": 4860 + }, + { + "epoch": 0.29180897597219724, + "grad_norm": 0.6378766894340515, + "learning_rate": 1.922098355206593e-05, + "loss": 0.0377, + "step": 4870 + }, + { + "epoch": 0.2924081730481155, + "grad_norm": 0.9037134647369385, + "learning_rate": 1.9217010888830533e-05, + "loss": 0.046, + "step": 4880 + }, + { + "epoch": 0.2930073701240338, + "grad_norm": 0.7720431685447693, + "learning_rate": 1.9213028582475176e-05, + "loss": 0.0519, + "step": 4890 + }, + { + "epoch": 0.29360656719995204, + "grad_norm": 0.7988153100013733, + "learning_rate": 1.9209036637673308e-05, + "loss": 0.0437, + "step": 4900 + }, + { + "epoch": 0.2942057642758703, + "grad_norm": 0.6672379970550537, + "learning_rate": 1.9205035059109705e-05, + "loss": 0.0432, + "step": 4910 + }, + { + "epoch": 0.29480496135178863, + "grad_norm": 0.8264118432998657, + "learning_rate": 1.9201023851480444e-05, + "loss": 0.0463, + "step": 4920 + }, + { + "epoch": 0.2954041584277069, + "grad_norm": 0.6753244400024414, + "learning_rate": 1.9197003019492895e-05, + "loss": 0.048, + "step": 4930 + }, + { + "epoch": 0.29600335550362517, + "grad_norm": 0.5530163645744324, + "learning_rate": 1.9192972567865732e-05, + "loss": 0.0552, + "step": 4940 + }, + { + "epoch": 0.29660255257954343, + "grad_norm": 1.4215611219406128, + "learning_rate": 1.9188932501328907e-05, + "loss": 0.0537, + "step": 4950 + }, + { + "epoch": 0.2972017496554617, + "grad_norm": 0.8495141267776489, + "learning_rate": 1.9184882824623675e-05, + "loss": 0.0431, + "step": 4960 + }, + { + "epoch": 0.29780094673137997, + "grad_norm": 0.5609806180000305, + "learning_rate": 1.918082354250255e-05, + "loss": 0.0355, + "step": 4970 + }, + { + "epoch": 0.29840014380729823, + "grad_norm": 0.30011680722236633, + "learning_rate": 1.9176754659729327e-05, + "loss": 0.0503, + "step": 4980 + }, + { + "epoch": 0.2989993408832165, + "grad_norm": 0.5155858993530273, + "learning_rate": 1.9172676181079067e-05, + "loss": 0.0402, + "step": 4990 + }, + { + "epoch": 0.29959853795913477, + "grad_norm": 0.48371294140815735, + "learning_rate": 1.9168588111338092e-05, + "loss": 0.0476, + "step": 5000 + }, + { + "epoch": 0.30019773503505304, + "grad_norm": 0.49065709114074707, + "learning_rate": 1.916449045530398e-05, + "loss": 0.0476, + "step": 5010 + }, + { + "epoch": 0.3007969321109713, + "grad_norm": 0.4877799451351166, + "learning_rate": 1.9160383217785554e-05, + "loss": 0.0337, + "step": 5020 + }, + { + "epoch": 0.30139612918688957, + "grad_norm": 0.5917441248893738, + "learning_rate": 1.9156266403602894e-05, + "loss": 0.0379, + "step": 5030 + }, + { + "epoch": 0.30199532626280784, + "grad_norm": 0.42583322525024414, + "learning_rate": 1.9152140017587303e-05, + "loss": 0.045, + "step": 5040 + }, + { + "epoch": 0.3025945233387261, + "grad_norm": 0.6343463659286499, + "learning_rate": 1.914800406458133e-05, + "loss": 0.0449, + "step": 5050 + }, + { + "epoch": 0.3031937204146444, + "grad_norm": 0.8575727343559265, + "learning_rate": 1.9143858549438746e-05, + "loss": 0.0453, + "step": 5060 + }, + { + "epoch": 0.30379291749056264, + "grad_norm": 0.7644649147987366, + "learning_rate": 1.9139703477024538e-05, + "loss": 0.0396, + "step": 5070 + }, + { + "epoch": 0.3043921145664809, + "grad_norm": 0.6534778475761414, + "learning_rate": 1.9135538852214923e-05, + "loss": 0.0354, + "step": 5080 + }, + { + "epoch": 0.3049913116423992, + "grad_norm": 0.3632607161998749, + "learning_rate": 1.9131364679897317e-05, + "loss": 0.035, + "step": 5090 + }, + { + "epoch": 0.30559050871831744, + "grad_norm": 0.9180629849433899, + "learning_rate": 1.912718096497034e-05, + "loss": 0.0371, + "step": 5100 + }, + { + "epoch": 0.3061897057942357, + "grad_norm": 0.48914220929145813, + "learning_rate": 1.912298771234382e-05, + "loss": 0.043, + "step": 5110 + }, + { + "epoch": 0.306788902870154, + "grad_norm": 0.8579902052879333, + "learning_rate": 1.911878492693877e-05, + "loss": 0.0467, + "step": 5120 + }, + { + "epoch": 0.30738809994607225, + "grad_norm": 1.523177146911621, + "learning_rate": 1.911457261368739e-05, + "loss": 0.0478, + "step": 5130 + }, + { + "epoch": 0.3079872970219905, + "grad_norm": 1.2650493383407593, + "learning_rate": 1.911035077753307e-05, + "loss": 0.046, + "step": 5140 + }, + { + "epoch": 0.3085864940979088, + "grad_norm": 0.8262631893157959, + "learning_rate": 1.9106119423430366e-05, + "loss": 0.0345, + "step": 5150 + }, + { + "epoch": 0.30918569117382705, + "grad_norm": 0.8710194826126099, + "learning_rate": 1.910187855634501e-05, + "loss": 0.0432, + "step": 5160 + }, + { + "epoch": 0.3097848882497453, + "grad_norm": 0.8287770748138428, + "learning_rate": 1.9097628181253895e-05, + "loss": 0.037, + "step": 5170 + }, + { + "epoch": 0.3103840853256636, + "grad_norm": 0.7243760824203491, + "learning_rate": 1.9093368303145074e-05, + "loss": 0.0445, + "step": 5180 + }, + { + "epoch": 0.3109832824015819, + "grad_norm": 0.5953600406646729, + "learning_rate": 1.9089098927017748e-05, + "loss": 0.0409, + "step": 5190 + }, + { + "epoch": 0.3115824794775002, + "grad_norm": 0.5678296685218811, + "learning_rate": 1.9084820057882266e-05, + "loss": 0.0405, + "step": 5200 + }, + { + "epoch": 0.31218167655341844, + "grad_norm": 0.764759361743927, + "learning_rate": 1.9080531700760125e-05, + "loss": 0.0399, + "step": 5210 + }, + { + "epoch": 0.3127808736293367, + "grad_norm": 0.5969082713127136, + "learning_rate": 1.907623386068395e-05, + "loss": 0.0345, + "step": 5220 + }, + { + "epoch": 0.313380070705255, + "grad_norm": 0.5686851739883423, + "learning_rate": 1.9071926542697493e-05, + "loss": 0.0415, + "step": 5230 + }, + { + "epoch": 0.31397926778117324, + "grad_norm": 0.7042335867881775, + "learning_rate": 1.9067609751855634e-05, + "loss": 0.0343, + "step": 5240 + }, + { + "epoch": 0.3145784648570915, + "grad_norm": 0.46049684286117554, + "learning_rate": 1.9063283493224366e-05, + "loss": 0.0367, + "step": 5250 + }, + { + "epoch": 0.3151776619330098, + "grad_norm": 0.521037757396698, + "learning_rate": 1.9058947771880793e-05, + "loss": 0.0493, + "step": 5260 + }, + { + "epoch": 0.31577685900892805, + "grad_norm": 0.6116137504577637, + "learning_rate": 1.905460259291313e-05, + "loss": 0.0341, + "step": 5270 + }, + { + "epoch": 0.3163760560848463, + "grad_norm": 0.6932541728019714, + "learning_rate": 1.9050247961420682e-05, + "loss": 0.038, + "step": 5280 + }, + { + "epoch": 0.3169752531607646, + "grad_norm": 0.6795322299003601, + "learning_rate": 1.9045883882513857e-05, + "loss": 0.0555, + "step": 5290 + }, + { + "epoch": 0.31757445023668285, + "grad_norm": 1.5589205026626587, + "learning_rate": 1.9041510361314136e-05, + "loss": 0.0498, + "step": 5300 + }, + { + "epoch": 0.3181736473126011, + "grad_norm": 0.58689945936203, + "learning_rate": 1.90371274029541e-05, + "loss": 0.0432, + "step": 5310 + }, + { + "epoch": 0.3187728443885194, + "grad_norm": 0.7746279239654541, + "learning_rate": 1.9032735012577386e-05, + "loss": 0.0455, + "step": 5320 + }, + { + "epoch": 0.31937204146443765, + "grad_norm": 0.4707143008708954, + "learning_rate": 1.9028333195338716e-05, + "loss": 0.0365, + "step": 5330 + }, + { + "epoch": 0.3199712385403559, + "grad_norm": 0.6717873811721802, + "learning_rate": 1.902392195640386e-05, + "loss": 0.0441, + "step": 5340 + }, + { + "epoch": 0.3205704356162742, + "grad_norm": 1.1001774072647095, + "learning_rate": 1.901950130094966e-05, + "loss": 0.0387, + "step": 5350 + }, + { + "epoch": 0.32116963269219245, + "grad_norm": 0.7521471977233887, + "learning_rate": 1.9015071234163994e-05, + "loss": 0.0389, + "step": 5360 + }, + { + "epoch": 0.3217688297681107, + "grad_norm": 0.6617273092269897, + "learning_rate": 1.9010631761245798e-05, + "loss": 0.045, + "step": 5370 + }, + { + "epoch": 0.322368026844029, + "grad_norm": 1.0732862949371338, + "learning_rate": 1.9006182887405043e-05, + "loss": 0.0461, + "step": 5380 + }, + { + "epoch": 0.32296722391994725, + "grad_norm": 0.43623387813568115, + "learning_rate": 1.9001724617862725e-05, + "loss": 0.0387, + "step": 5390 + }, + { + "epoch": 0.3235664209958655, + "grad_norm": 0.5842541456222534, + "learning_rate": 1.8997256957850872e-05, + "loss": 0.0401, + "step": 5400 + }, + { + "epoch": 0.3241656180717838, + "grad_norm": 0.8832051753997803, + "learning_rate": 1.899277991261253e-05, + "loss": 0.0434, + "step": 5410 + }, + { + "epoch": 0.32476481514770206, + "grad_norm": 0.8454849123954773, + "learning_rate": 1.898829348740177e-05, + "loss": 0.0364, + "step": 5420 + }, + { + "epoch": 0.3253640122236203, + "grad_norm": 0.4587421119213104, + "learning_rate": 1.898379768748365e-05, + "loss": 0.0342, + "step": 5430 + }, + { + "epoch": 0.3259632092995386, + "grad_norm": 0.5914700627326965, + "learning_rate": 1.897929251813425e-05, + "loss": 0.0381, + "step": 5440 + }, + { + "epoch": 0.3265624063754569, + "grad_norm": 0.5075448751449585, + "learning_rate": 1.8974777984640633e-05, + "loss": 0.0614, + "step": 5450 + }, + { + "epoch": 0.3271616034513752, + "grad_norm": 0.6165316700935364, + "learning_rate": 1.8970254092300856e-05, + "loss": 0.0355, + "step": 5460 + }, + { + "epoch": 0.32776080052729345, + "grad_norm": 0.8761339783668518, + "learning_rate": 1.896572084642396e-05, + "loss": 0.0382, + "step": 5470 + }, + { + "epoch": 0.3283599976032117, + "grad_norm": 0.8730667233467102, + "learning_rate": 1.8961178252329964e-05, + "loss": 0.0486, + "step": 5480 + }, + { + "epoch": 0.32895919467913, + "grad_norm": 0.4631735384464264, + "learning_rate": 1.895662631534985e-05, + "loss": 0.0479, + "step": 5490 + }, + { + "epoch": 0.32955839175504825, + "grad_norm": 0.7657212615013123, + "learning_rate": 1.895206504082557e-05, + "loss": 0.0359, + "step": 5500 + }, + { + "epoch": 0.3301575888309665, + "grad_norm": 0.49685898423194885, + "learning_rate": 1.894749443411004e-05, + "loss": 0.037, + "step": 5510 + }, + { + "epoch": 0.3307567859068848, + "grad_norm": 0.8567603230476379, + "learning_rate": 1.8942914500567117e-05, + "loss": 0.0415, + "step": 5520 + }, + { + "epoch": 0.33135598298280305, + "grad_norm": 0.8778802156448364, + "learning_rate": 1.8938325245571605e-05, + "loss": 0.0427, + "step": 5530 + }, + { + "epoch": 0.3319551800587213, + "grad_norm": 0.7849876284599304, + "learning_rate": 1.8933726674509255e-05, + "loss": 0.041, + "step": 5540 + }, + { + "epoch": 0.3325543771346396, + "grad_norm": 0.49304109811782837, + "learning_rate": 1.8929118792776747e-05, + "loss": 0.0406, + "step": 5550 + }, + { + "epoch": 0.33315357421055786, + "grad_norm": 0.6490961909294128, + "learning_rate": 1.892450160578168e-05, + "loss": 0.0457, + "step": 5560 + }, + { + "epoch": 0.3337527712864761, + "grad_norm": 1.1704363822937012, + "learning_rate": 1.891987511894259e-05, + "loss": 0.0489, + "step": 5570 + }, + { + "epoch": 0.3343519683623944, + "grad_norm": 0.6955687403678894, + "learning_rate": 1.891523933768891e-05, + "loss": 0.0426, + "step": 5580 + }, + { + "epoch": 0.33495116543831266, + "grad_norm": 0.9385222792625427, + "learning_rate": 1.891059426746099e-05, + "loss": 0.0397, + "step": 5590 + }, + { + "epoch": 0.3355503625142309, + "grad_norm": 1.0259507894515991, + "learning_rate": 1.8905939913710078e-05, + "loss": 0.0406, + "step": 5600 + }, + { + "epoch": 0.3361495595901492, + "grad_norm": 1.5581048727035522, + "learning_rate": 1.8901276281898317e-05, + "loss": 0.0377, + "step": 5610 + }, + { + "epoch": 0.33674875666606746, + "grad_norm": 1.1154224872589111, + "learning_rate": 1.889660337749874e-05, + "loss": 0.0352, + "step": 5620 + }, + { + "epoch": 0.33734795374198573, + "grad_norm": 0.8913238048553467, + "learning_rate": 1.8891921205995257e-05, + "loss": 0.0372, + "step": 5630 + }, + { + "epoch": 0.337947150817904, + "grad_norm": 0.32929253578186035, + "learning_rate": 1.888722977288266e-05, + "loss": 0.0302, + "step": 5640 + }, + { + "epoch": 0.33854634789382226, + "grad_norm": 0.7686375379562378, + "learning_rate": 1.888252908366661e-05, + "loss": 0.0531, + "step": 5650 + }, + { + "epoch": 0.33914554496974053, + "grad_norm": 0.7077587246894836, + "learning_rate": 1.8877819143863623e-05, + "loss": 0.0404, + "step": 5660 + }, + { + "epoch": 0.3397447420456588, + "grad_norm": 0.7370178699493408, + "learning_rate": 1.887309995900108e-05, + "loss": 0.0379, + "step": 5670 + }, + { + "epoch": 0.34034393912157707, + "grad_norm": 0.8013477325439453, + "learning_rate": 1.88683715346172e-05, + "loss": 0.0391, + "step": 5680 + }, + { + "epoch": 0.34094313619749533, + "grad_norm": 0.9743591547012329, + "learning_rate": 1.8863633876261056e-05, + "loss": 0.0466, + "step": 5690 + }, + { + "epoch": 0.3415423332734136, + "grad_norm": 0.6816489100456238, + "learning_rate": 1.8858886989492557e-05, + "loss": 0.0509, + "step": 5700 + }, + { + "epoch": 0.34214153034933187, + "grad_norm": 0.7814317345619202, + "learning_rate": 1.885413087988244e-05, + "loss": 0.0449, + "step": 5710 + }, + { + "epoch": 0.3427407274252502, + "grad_norm": 0.6797910332679749, + "learning_rate": 1.8849365553012257e-05, + "loss": 0.041, + "step": 5720 + }, + { + "epoch": 0.34333992450116846, + "grad_norm": 0.7159250378608704, + "learning_rate": 1.884459101447439e-05, + "loss": 0.0408, + "step": 5730 + }, + { + "epoch": 0.3439391215770867, + "grad_norm": 0.7630175352096558, + "learning_rate": 1.8839807269872027e-05, + "loss": 0.0403, + "step": 5740 + }, + { + "epoch": 0.344538318653005, + "grad_norm": 0.7929314374923706, + "learning_rate": 1.8835014324819155e-05, + "loss": 0.0468, + "step": 5750 + }, + { + "epoch": 0.34513751572892326, + "grad_norm": 0.5765302181243896, + "learning_rate": 1.8830212184940565e-05, + "loss": 0.0382, + "step": 5760 + }, + { + "epoch": 0.34573671280484153, + "grad_norm": 0.5043740272521973, + "learning_rate": 1.882540085587183e-05, + "loss": 0.0447, + "step": 5770 + }, + { + "epoch": 0.3463359098807598, + "grad_norm": 0.7895818948745728, + "learning_rate": 1.8820580343259322e-05, + "loss": 0.0381, + "step": 5780 + }, + { + "epoch": 0.34693510695667806, + "grad_norm": 0.8037170767784119, + "learning_rate": 1.881575065276017e-05, + "loss": 0.0434, + "step": 5790 + }, + { + "epoch": 0.34753430403259633, + "grad_norm": 1.0758732557296753, + "learning_rate": 1.8810911790042297e-05, + "loss": 0.0369, + "step": 5800 + }, + { + "epoch": 0.3481335011085146, + "grad_norm": 0.6673927307128906, + "learning_rate": 1.880606376078437e-05, + "loss": 0.0475, + "step": 5810 + }, + { + "epoch": 0.34873269818443287, + "grad_norm": 0.6661775708198547, + "learning_rate": 1.880120657067582e-05, + "loss": 0.0478, + "step": 5820 + }, + { + "epoch": 0.34933189526035113, + "grad_norm": 0.6422731280326843, + "learning_rate": 1.8796340225416837e-05, + "loss": 0.0397, + "step": 5830 + }, + { + "epoch": 0.3499310923362694, + "grad_norm": 0.6632615923881531, + "learning_rate": 1.8791464730718342e-05, + "loss": 0.0377, + "step": 5840 + }, + { + "epoch": 0.35053028941218767, + "grad_norm": 0.5715954899787903, + "learning_rate": 1.8786580092302e-05, + "loss": 0.0306, + "step": 5850 + }, + { + "epoch": 0.35112948648810594, + "grad_norm": 0.3375200629234314, + "learning_rate": 1.878168631590021e-05, + "loss": 0.0385, + "step": 5860 + }, + { + "epoch": 0.3517286835640242, + "grad_norm": 0.42938506603240967, + "learning_rate": 1.877678340725609e-05, + "loss": 0.0359, + "step": 5870 + }, + { + "epoch": 0.35232788063994247, + "grad_norm": 0.453436940908432, + "learning_rate": 1.8771871372123474e-05, + "loss": 0.0498, + "step": 5880 + }, + { + "epoch": 0.35292707771586074, + "grad_norm": 0.763883113861084, + "learning_rate": 1.8766950216266914e-05, + "loss": 0.037, + "step": 5890 + }, + { + "epoch": 0.353526274791779, + "grad_norm": 0.9350517392158508, + "learning_rate": 1.8762019945461655e-05, + "loss": 0.0524, + "step": 5900 + }, + { + "epoch": 0.3541254718676973, + "grad_norm": 0.6795313358306885, + "learning_rate": 1.875708056549365e-05, + "loss": 0.0336, + "step": 5910 + }, + { + "epoch": 0.35472466894361554, + "grad_norm": 0.4761887788772583, + "learning_rate": 1.875213208215953e-05, + "loss": 0.04, + "step": 5920 + }, + { + "epoch": 0.3553238660195338, + "grad_norm": 0.6547576189041138, + "learning_rate": 1.874717450126662e-05, + "loss": 0.0359, + "step": 5930 + }, + { + "epoch": 0.3559230630954521, + "grad_norm": 0.7119831442832947, + "learning_rate": 1.8742207828632912e-05, + "loss": 0.0382, + "step": 5940 + }, + { + "epoch": 0.35652226017137034, + "grad_norm": 0.5195598602294922, + "learning_rate": 1.8737232070087082e-05, + "loss": 0.0577, + "step": 5950 + }, + { + "epoch": 0.3571214572472886, + "grad_norm": 0.44893282651901245, + "learning_rate": 1.8732247231468455e-05, + "loss": 0.034, + "step": 5960 + }, + { + "epoch": 0.3577206543232069, + "grad_norm": 0.5159012079238892, + "learning_rate": 1.8727253318627016e-05, + "loss": 0.0374, + "step": 5970 + }, + { + "epoch": 0.3583198513991252, + "grad_norm": 0.6474353075027466, + "learning_rate": 1.8722250337423396e-05, + "loss": 0.0275, + "step": 5980 + }, + { + "epoch": 0.35891904847504347, + "grad_norm": 0.5070436000823975, + "learning_rate": 1.871723829372888e-05, + "loss": 0.0382, + "step": 5990 + }, + { + "epoch": 0.35951824555096173, + "grad_norm": 0.28868627548217773, + "learning_rate": 1.8712217193425378e-05, + "loss": 0.0442, + "step": 6000 + }, + { + "epoch": 0.36011744262688, + "grad_norm": 0.3915226459503174, + "learning_rate": 1.870718704240543e-05, + "loss": 0.0439, + "step": 6010 + }, + { + "epoch": 0.36071663970279827, + "grad_norm": 0.6271824836730957, + "learning_rate": 1.87021478465722e-05, + "loss": 0.0395, + "step": 6020 + }, + { + "epoch": 0.36131583677871654, + "grad_norm": 1.2117619514465332, + "learning_rate": 1.869709961183946e-05, + "loss": 0.0409, + "step": 6030 + }, + { + "epoch": 0.3619150338546348, + "grad_norm": 0.4455721378326416, + "learning_rate": 1.86920423441316e-05, + "loss": 0.0399, + "step": 6040 + }, + { + "epoch": 0.36251423093055307, + "grad_norm": 0.4492298662662506, + "learning_rate": 1.8686976049383603e-05, + "loss": 0.0445, + "step": 6050 + }, + { + "epoch": 0.36311342800647134, + "grad_norm": 0.32646581530570984, + "learning_rate": 1.8681900733541047e-05, + "loss": 0.0435, + "step": 6060 + }, + { + "epoch": 0.3637126250823896, + "grad_norm": 0.4477322995662689, + "learning_rate": 1.86768164025601e-05, + "loss": 0.0383, + "step": 6070 + }, + { + "epoch": 0.3643118221583079, + "grad_norm": 0.6562448740005493, + "learning_rate": 1.8671723062407506e-05, + "loss": 0.0317, + "step": 6080 + }, + { + "epoch": 0.36491101923422614, + "grad_norm": 0.25427868962287903, + "learning_rate": 1.8666620719060587e-05, + "loss": 0.0326, + "step": 6090 + }, + { + "epoch": 0.3655102163101444, + "grad_norm": 0.6234788298606873, + "learning_rate": 1.8661509378507225e-05, + "loss": 0.0328, + "step": 6100 + }, + { + "epoch": 0.3661094133860627, + "grad_norm": 0.4264411926269531, + "learning_rate": 1.865638904674586e-05, + "loss": 0.0379, + "step": 6110 + }, + { + "epoch": 0.36670861046198094, + "grad_norm": 0.5537038445472717, + "learning_rate": 1.865125972978549e-05, + "loss": 0.0383, + "step": 6120 + }, + { + "epoch": 0.3673078075378992, + "grad_norm": 0.5042442679405212, + "learning_rate": 1.864612143364565e-05, + "loss": 0.0339, + "step": 6130 + }, + { + "epoch": 0.3679070046138175, + "grad_norm": 0.4152010679244995, + "learning_rate": 1.8640974164356425e-05, + "loss": 0.0324, + "step": 6140 + }, + { + "epoch": 0.36850620168973575, + "grad_norm": 0.6834092736244202, + "learning_rate": 1.8635817927958416e-05, + "loss": 0.0364, + "step": 6150 + }, + { + "epoch": 0.369105398765654, + "grad_norm": 0.6276392340660095, + "learning_rate": 1.8630652730502752e-05, + "loss": 0.0336, + "step": 6160 + }, + { + "epoch": 0.3697045958415723, + "grad_norm": 0.687937319278717, + "learning_rate": 1.8625478578051085e-05, + "loss": 0.0415, + "step": 6170 + }, + { + "epoch": 0.37030379291749055, + "grad_norm": 0.48481765389442444, + "learning_rate": 1.8620295476675565e-05, + "loss": 0.0376, + "step": 6180 + }, + { + "epoch": 0.3709029899934088, + "grad_norm": 1.1335153579711914, + "learning_rate": 1.8615103432458853e-05, + "loss": 0.0421, + "step": 6190 + }, + { + "epoch": 0.3715021870693271, + "grad_norm": 0.6853719353675842, + "learning_rate": 1.8609902451494106e-05, + "loss": 0.043, + "step": 6200 + }, + { + "epoch": 0.37210138414524535, + "grad_norm": 0.97500079870224, + "learning_rate": 1.860469253988496e-05, + "loss": 0.0334, + "step": 6210 + }, + { + "epoch": 0.3727005812211636, + "grad_norm": 0.2953243553638458, + "learning_rate": 1.8599473703745537e-05, + "loss": 0.0334, + "step": 6220 + }, + { + "epoch": 0.3732997782970819, + "grad_norm": 0.6563237309455872, + "learning_rate": 1.8594245949200437e-05, + "loss": 0.0349, + "step": 6230 + }, + { + "epoch": 0.37389897537300015, + "grad_norm": 0.4983973205089569, + "learning_rate": 1.8589009282384714e-05, + "loss": 0.0441, + "step": 6240 + }, + { + "epoch": 0.3744981724489185, + "grad_norm": 0.42969775199890137, + "learning_rate": 1.8583763709443892e-05, + "loss": 0.0319, + "step": 6250 + }, + { + "epoch": 0.37509736952483674, + "grad_norm": 0.8316324353218079, + "learning_rate": 1.8578509236533943e-05, + "loss": 0.0359, + "step": 6260 + }, + { + "epoch": 0.375696566600755, + "grad_norm": 0.4386466443538666, + "learning_rate": 1.8573245869821278e-05, + "loss": 0.0371, + "step": 6270 + }, + { + "epoch": 0.3762957636766733, + "grad_norm": 0.5664681792259216, + "learning_rate": 1.8567973615482764e-05, + "loss": 0.0359, + "step": 6280 + }, + { + "epoch": 0.37689496075259155, + "grad_norm": 0.5660601854324341, + "learning_rate": 1.8562692479705674e-05, + "loss": 0.0297, + "step": 6290 + }, + { + "epoch": 0.3774941578285098, + "grad_norm": 0.6432987451553345, + "learning_rate": 1.8557402468687716e-05, + "loss": 0.0447, + "step": 6300 + }, + { + "epoch": 0.3780933549044281, + "grad_norm": 0.6026568412780762, + "learning_rate": 1.8552103588637015e-05, + "loss": 0.0382, + "step": 6310 + }, + { + "epoch": 0.37869255198034635, + "grad_norm": 0.5358585119247437, + "learning_rate": 1.8546795845772105e-05, + "loss": 0.0366, + "step": 6320 + }, + { + "epoch": 0.3792917490562646, + "grad_norm": 0.3575671315193176, + "learning_rate": 1.8541479246321912e-05, + "loss": 0.0394, + "step": 6330 + }, + { + "epoch": 0.3798909461321829, + "grad_norm": 0.6645073890686035, + "learning_rate": 1.8536153796525767e-05, + "loss": 0.0391, + "step": 6340 + }, + { + "epoch": 0.38049014320810115, + "grad_norm": 0.6527594923973083, + "learning_rate": 1.8530819502633373e-05, + "loss": 0.0334, + "step": 6350 + }, + { + "epoch": 0.3810893402840194, + "grad_norm": 0.5664045810699463, + "learning_rate": 1.852547637090483e-05, + "loss": 0.0426, + "step": 6360 + }, + { + "epoch": 0.3816885373599377, + "grad_norm": 0.3317505419254303, + "learning_rate": 1.85201244076106e-05, + "loss": 0.0366, + "step": 6370 + }, + { + "epoch": 0.38228773443585595, + "grad_norm": 0.7218614220619202, + "learning_rate": 1.8514763619031505e-05, + "loss": 0.0399, + "step": 6380 + }, + { + "epoch": 0.3828869315117742, + "grad_norm": 0.6683867573738098, + "learning_rate": 1.8509394011458736e-05, + "loss": 0.0385, + "step": 6390 + }, + { + "epoch": 0.3834861285876925, + "grad_norm": 0.6589217185974121, + "learning_rate": 1.8504015591193817e-05, + "loss": 0.0445, + "step": 6400 + }, + { + "epoch": 0.38408532566361075, + "grad_norm": 0.39663317799568176, + "learning_rate": 1.849862836454863e-05, + "loss": 0.0515, + "step": 6410 + }, + { + "epoch": 0.384684522739529, + "grad_norm": 0.9468401074409485, + "learning_rate": 1.8493232337845385e-05, + "loss": 0.0421, + "step": 6420 + }, + { + "epoch": 0.3852837198154473, + "grad_norm": 1.0980640649795532, + "learning_rate": 1.848782751741662e-05, + "loss": 0.0431, + "step": 6430 + }, + { + "epoch": 0.38588291689136556, + "grad_norm": 1.4567275047302246, + "learning_rate": 1.848241390960519e-05, + "loss": 0.0467, + "step": 6440 + }, + { + "epoch": 0.3864821139672838, + "grad_norm": 0.3785778284072876, + "learning_rate": 1.847699152076427e-05, + "loss": 0.0437, + "step": 6450 + }, + { + "epoch": 0.3870813110432021, + "grad_norm": 0.8112056255340576, + "learning_rate": 1.8471560357257337e-05, + "loss": 0.0406, + "step": 6460 + }, + { + "epoch": 0.38768050811912036, + "grad_norm": 0.8885411024093628, + "learning_rate": 1.8466120425458155e-05, + "loss": 0.0452, + "step": 6470 + }, + { + "epoch": 0.3882797051950386, + "grad_norm": 0.3356691002845764, + "learning_rate": 1.8460671731750792e-05, + "loss": 0.033, + "step": 6480 + }, + { + "epoch": 0.3888789022709569, + "grad_norm": 0.7636258602142334, + "learning_rate": 1.8455214282529592e-05, + "loss": 0.039, + "step": 6490 + }, + { + "epoch": 0.38947809934687516, + "grad_norm": 0.5050523281097412, + "learning_rate": 1.844974808419918e-05, + "loss": 0.0331, + "step": 6500 + }, + { + "epoch": 0.39007729642279343, + "grad_norm": 0.3761812150478363, + "learning_rate": 1.844427314317444e-05, + "loss": 0.0346, + "step": 6510 + }, + { + "epoch": 0.39067649349871175, + "grad_norm": 0.560323178768158, + "learning_rate": 1.843878946588052e-05, + "loss": 0.0417, + "step": 6520 + }, + { + "epoch": 0.39127569057463, + "grad_norm": 0.5850566625595093, + "learning_rate": 1.8433297058752828e-05, + "loss": 0.0366, + "step": 6530 + }, + { + "epoch": 0.3918748876505483, + "grad_norm": 0.4377721846103668, + "learning_rate": 1.8427795928237e-05, + "loss": 0.0315, + "step": 6540 + }, + { + "epoch": 0.39247408472646655, + "grad_norm": 0.5460193157196045, + "learning_rate": 1.842228608078892e-05, + "loss": 0.0465, + "step": 6550 + }, + { + "epoch": 0.3930732818023848, + "grad_norm": 0.3818223476409912, + "learning_rate": 1.8416767522874708e-05, + "loss": 0.0313, + "step": 6560 + }, + { + "epoch": 0.3936724788783031, + "grad_norm": 0.566722571849823, + "learning_rate": 1.8411240260970692e-05, + "loss": 0.037, + "step": 6570 + }, + { + "epoch": 0.39427167595422136, + "grad_norm": 0.970040500164032, + "learning_rate": 1.8405704301563424e-05, + "loss": 0.0354, + "step": 6580 + }, + { + "epoch": 0.3948708730301396, + "grad_norm": 0.4968736171722412, + "learning_rate": 1.8400159651149665e-05, + "loss": 0.0376, + "step": 6590 + }, + { + "epoch": 0.3954700701060579, + "grad_norm": 0.5235893130302429, + "learning_rate": 1.8394606316236368e-05, + "loss": 0.0383, + "step": 6600 + }, + { + "epoch": 0.39606926718197616, + "grad_norm": 0.853208065032959, + "learning_rate": 1.8389044303340676e-05, + "loss": 0.0384, + "step": 6610 + }, + { + "epoch": 0.3966684642578944, + "grad_norm": 0.4627811312675476, + "learning_rate": 1.838347361898993e-05, + "loss": 0.0615, + "step": 6620 + }, + { + "epoch": 0.3972676613338127, + "grad_norm": 0.4883791208267212, + "learning_rate": 1.837789426972163e-05, + "loss": 0.0307, + "step": 6630 + }, + { + "epoch": 0.39786685840973096, + "grad_norm": 0.4702740013599396, + "learning_rate": 1.8372306262083456e-05, + "loss": 0.0539, + "step": 6640 + }, + { + "epoch": 0.39846605548564923, + "grad_norm": 0.5020611882209778, + "learning_rate": 1.8366709602633252e-05, + "loss": 0.0378, + "step": 6650 + }, + { + "epoch": 0.3990652525615675, + "grad_norm": 0.706611692905426, + "learning_rate": 1.8361104297939e-05, + "loss": 0.0309, + "step": 6660 + }, + { + "epoch": 0.39966444963748576, + "grad_norm": 0.6137747764587402, + "learning_rate": 1.8355490354578844e-05, + "loss": 0.0364, + "step": 6670 + }, + { + "epoch": 0.40026364671340403, + "grad_norm": 0.45299193263053894, + "learning_rate": 1.8349867779141056e-05, + "loss": 0.0359, + "step": 6680 + }, + { + "epoch": 0.4008628437893223, + "grad_norm": 0.31410297751426697, + "learning_rate": 1.8344236578224044e-05, + "loss": 0.0425, + "step": 6690 + }, + { + "epoch": 0.40146204086524057, + "grad_norm": 0.48510870337486267, + "learning_rate": 1.8338596758436333e-05, + "loss": 0.04, + "step": 6700 + }, + { + "epoch": 0.40206123794115883, + "grad_norm": 0.4697261154651642, + "learning_rate": 1.8332948326396567e-05, + "loss": 0.0401, + "step": 6710 + }, + { + "epoch": 0.4026604350170771, + "grad_norm": 0.8231471180915833, + "learning_rate": 1.8327291288733496e-05, + "loss": 0.0346, + "step": 6720 + }, + { + "epoch": 0.40325963209299537, + "grad_norm": 0.9511741995811462, + "learning_rate": 1.832162565208597e-05, + "loss": 0.038, + "step": 6730 + }, + { + "epoch": 0.40385882916891364, + "grad_norm": 0.4473752975463867, + "learning_rate": 1.8315951423102923e-05, + "loss": 0.0421, + "step": 6740 + }, + { + "epoch": 0.4044580262448319, + "grad_norm": 0.5309840440750122, + "learning_rate": 1.831026860844339e-05, + "loss": 0.0375, + "step": 6750 + }, + { + "epoch": 0.40505722332075017, + "grad_norm": 1.1700010299682617, + "learning_rate": 1.8304577214776464e-05, + "loss": 0.0424, + "step": 6760 + }, + { + "epoch": 0.40565642039666844, + "grad_norm": 0.5007262229919434, + "learning_rate": 1.8298877248781307e-05, + "loss": 0.0389, + "step": 6770 + }, + { + "epoch": 0.40625561747258676, + "grad_norm": 0.8835527300834656, + "learning_rate": 1.8293168717147152e-05, + "loss": 0.0348, + "step": 6780 + }, + { + "epoch": 0.40685481454850503, + "grad_norm": 0.6059357523918152, + "learning_rate": 1.828745162657328e-05, + "loss": 0.0367, + "step": 6790 + }, + { + "epoch": 0.4074540116244233, + "grad_norm": 0.37744027376174927, + "learning_rate": 1.828172598376902e-05, + "loss": 0.0391, + "step": 6800 + }, + { + "epoch": 0.40805320870034156, + "grad_norm": 0.5641717910766602, + "learning_rate": 1.827599179545372e-05, + "loss": 0.0383, + "step": 6810 + }, + { + "epoch": 0.40865240577625983, + "grad_norm": 0.4394749104976654, + "learning_rate": 1.827024906835678e-05, + "loss": 0.0394, + "step": 6820 + }, + { + "epoch": 0.4092516028521781, + "grad_norm": 0.7094572186470032, + "learning_rate": 1.82644978092176e-05, + "loss": 0.0384, + "step": 6830 + }, + { + "epoch": 0.40985079992809637, + "grad_norm": 0.6306723952293396, + "learning_rate": 1.825873802478562e-05, + "loss": 0.0347, + "step": 6840 + }, + { + "epoch": 0.41044999700401463, + "grad_norm": 0.4480315148830414, + "learning_rate": 1.825296972182025e-05, + "loss": 0.0415, + "step": 6850 + }, + { + "epoch": 0.4110491940799329, + "grad_norm": 1.014607310295105, + "learning_rate": 1.8247192907090922e-05, + "loss": 0.0426, + "step": 6860 + }, + { + "epoch": 0.41164839115585117, + "grad_norm": 0.7599517107009888, + "learning_rate": 1.8241407587377054e-05, + "loss": 0.0433, + "step": 6870 + }, + { + "epoch": 0.41224758823176944, + "grad_norm": 1.0942739248275757, + "learning_rate": 1.8235613769468034e-05, + "loss": 0.0378, + "step": 6880 + }, + { + "epoch": 0.4128467853076877, + "grad_norm": 0.47618037462234497, + "learning_rate": 1.8229811460163232e-05, + "loss": 0.0312, + "step": 6890 + }, + { + "epoch": 0.41344598238360597, + "grad_norm": 0.6470023393630981, + "learning_rate": 1.8224000666271983e-05, + "loss": 0.0382, + "step": 6900 + }, + { + "epoch": 0.41404517945952424, + "grad_norm": 0.6031871438026428, + "learning_rate": 1.8218181394613578e-05, + "loss": 0.0336, + "step": 6910 + }, + { + "epoch": 0.4146443765354425, + "grad_norm": 0.7470970749855042, + "learning_rate": 1.821235365201725e-05, + "loss": 0.0318, + "step": 6920 + }, + { + "epoch": 0.4152435736113608, + "grad_norm": 0.46166181564331055, + "learning_rate": 1.820651744532219e-05, + "loss": 0.0361, + "step": 6930 + }, + { + "epoch": 0.41584277068727904, + "grad_norm": 0.5585920214653015, + "learning_rate": 1.82006727813775e-05, + "loss": 0.0443, + "step": 6940 + }, + { + "epoch": 0.4164419677631973, + "grad_norm": 0.5172198414802551, + "learning_rate": 1.819481966704223e-05, + "loss": 0.0396, + "step": 6950 + }, + { + "epoch": 0.4170411648391156, + "grad_norm": 0.4908123314380646, + "learning_rate": 1.8188958109185325e-05, + "loss": 0.0294, + "step": 6960 + }, + { + "epoch": 0.41764036191503384, + "grad_norm": 0.5269665122032166, + "learning_rate": 1.8183088114685658e-05, + "loss": 0.0343, + "step": 6970 + }, + { + "epoch": 0.4182395589909521, + "grad_norm": 0.747257649898529, + "learning_rate": 1.8177209690431992e-05, + "loss": 0.0395, + "step": 6980 + }, + { + "epoch": 0.4188387560668704, + "grad_norm": 0.6794129610061646, + "learning_rate": 1.8171322843322984e-05, + "loss": 0.0471, + "step": 6990 + }, + { + "epoch": 0.41943795314278864, + "grad_norm": 0.4291394054889679, + "learning_rate": 1.8165427580267176e-05, + "loss": 0.0388, + "step": 7000 + }, + { + "epoch": 0.4200371502187069, + "grad_norm": 0.8051080107688904, + "learning_rate": 1.815952390818299e-05, + "loss": 0.045, + "step": 7010 + }, + { + "epoch": 0.4206363472946252, + "grad_norm": 0.557299792766571, + "learning_rate": 1.8153611833998715e-05, + "loss": 0.0384, + "step": 7020 + }, + { + "epoch": 0.42123554437054345, + "grad_norm": 0.37832972407341003, + "learning_rate": 1.8147691364652496e-05, + "loss": 0.0333, + "step": 7030 + }, + { + "epoch": 0.4218347414464617, + "grad_norm": 0.30844688415527344, + "learning_rate": 1.8141762507092336e-05, + "loss": 0.033, + "step": 7040 + }, + { + "epoch": 0.42243393852238004, + "grad_norm": 0.3014371395111084, + "learning_rate": 1.813582526827608e-05, + "loss": 0.0344, + "step": 7050 + }, + { + "epoch": 0.4230331355982983, + "grad_norm": 0.778361439704895, + "learning_rate": 1.8129879655171402e-05, + "loss": 0.0351, + "step": 7060 + }, + { + "epoch": 0.42363233267421657, + "grad_norm": 1.14492666721344, + "learning_rate": 1.8123925674755817e-05, + "loss": 0.0462, + "step": 7070 + }, + { + "epoch": 0.42423152975013484, + "grad_norm": 0.35099321603775024, + "learning_rate": 1.8117963334016652e-05, + "loss": 0.0371, + "step": 7080 + }, + { + "epoch": 0.4248307268260531, + "grad_norm": 0.8470032215118408, + "learning_rate": 1.8111992639951047e-05, + "loss": 0.0339, + "step": 7090 + }, + { + "epoch": 0.4254299239019714, + "grad_norm": 0.641718327999115, + "learning_rate": 1.810601359956594e-05, + "loss": 0.0363, + "step": 7100 + }, + { + "epoch": 0.42602912097788964, + "grad_norm": 0.6668172478675842, + "learning_rate": 1.810002621987807e-05, + "loss": 0.0383, + "step": 7110 + }, + { + "epoch": 0.4266283180538079, + "grad_norm": 0.9396918416023254, + "learning_rate": 1.809403050791396e-05, + "loss": 0.0401, + "step": 7120 + }, + { + "epoch": 0.4272275151297262, + "grad_norm": 0.5773718953132629, + "learning_rate": 1.8088026470709915e-05, + "loss": 0.0356, + "step": 7130 + }, + { + "epoch": 0.42782671220564444, + "grad_norm": 0.6474881172180176, + "learning_rate": 1.8082014115312005e-05, + "loss": 0.0487, + "step": 7140 + }, + { + "epoch": 0.4284259092815627, + "grad_norm": 0.5183063745498657, + "learning_rate": 1.807599344877606e-05, + "loss": 0.037, + "step": 7150 + }, + { + "epoch": 0.429025106357481, + "grad_norm": 0.7699562311172485, + "learning_rate": 1.8069964478167673e-05, + "loss": 0.0487, + "step": 7160 + }, + { + "epoch": 0.42962430343339925, + "grad_norm": 0.6379490494728088, + "learning_rate": 1.806392721056217e-05, + "loss": 0.0407, + "step": 7170 + }, + { + "epoch": 0.4302235005093175, + "grad_norm": 0.4757876396179199, + "learning_rate": 1.8057881653044628e-05, + "loss": 0.0307, + "step": 7180 + }, + { + "epoch": 0.4308226975852358, + "grad_norm": 0.47382786870002747, + "learning_rate": 1.8051827812709847e-05, + "loss": 0.0367, + "step": 7190 + }, + { + "epoch": 0.43142189466115405, + "grad_norm": 0.6868136525154114, + "learning_rate": 1.804576569666234e-05, + "loss": 0.0311, + "step": 7200 + }, + { + "epoch": 0.4320210917370723, + "grad_norm": 0.5475189089775085, + "learning_rate": 1.803969531201634e-05, + "loss": 0.0293, + "step": 7210 + }, + { + "epoch": 0.4326202888129906, + "grad_norm": 1.013775110244751, + "learning_rate": 1.803361666589578e-05, + "loss": 0.0383, + "step": 7220 + }, + { + "epoch": 0.43321948588890885, + "grad_norm": 0.46351560950279236, + "learning_rate": 1.802752976543429e-05, + "loss": 0.0404, + "step": 7230 + }, + { + "epoch": 0.4338186829648271, + "grad_norm": 0.4883617162704468, + "learning_rate": 1.8021434617775192e-05, + "loss": 0.0408, + "step": 7240 + }, + { + "epoch": 0.4344178800407454, + "grad_norm": 0.6282979249954224, + "learning_rate": 1.8015331230071474e-05, + "loss": 0.0428, + "step": 7250 + }, + { + "epoch": 0.43501707711666365, + "grad_norm": 1.0833567380905151, + "learning_rate": 1.8009219609485805e-05, + "loss": 0.0394, + "step": 7260 + }, + { + "epoch": 0.4356162741925819, + "grad_norm": 0.46628257632255554, + "learning_rate": 1.8003099763190507e-05, + "loss": 0.0405, + "step": 7270 + }, + { + "epoch": 0.4362154712685002, + "grad_norm": 0.7581565380096436, + "learning_rate": 1.7996971698367574e-05, + "loss": 0.0534, + "step": 7280 + }, + { + "epoch": 0.43681466834441846, + "grad_norm": 0.7900646328926086, + "learning_rate": 1.799083542220862e-05, + "loss": 0.0432, + "step": 7290 + }, + { + "epoch": 0.4374138654203367, + "grad_norm": 0.6033529043197632, + "learning_rate": 1.7984690941914905e-05, + "loss": 0.0438, + "step": 7300 + }, + { + "epoch": 0.43801306249625505, + "grad_norm": 0.924926221370697, + "learning_rate": 1.7978538264697326e-05, + "loss": 0.0347, + "step": 7310 + }, + { + "epoch": 0.4386122595721733, + "grad_norm": 0.8485580682754517, + "learning_rate": 1.797237739777639e-05, + "loss": 0.0523, + "step": 7320 + }, + { + "epoch": 0.4392114566480916, + "grad_norm": 0.3205278217792511, + "learning_rate": 1.7966208348382218e-05, + "loss": 0.0334, + "step": 7330 + }, + { + "epoch": 0.43981065372400985, + "grad_norm": 0.5392606854438782, + "learning_rate": 1.7960031123754528e-05, + "loss": 0.03, + "step": 7340 + }, + { + "epoch": 0.4404098507999281, + "grad_norm": 0.6815987229347229, + "learning_rate": 1.795384573114265e-05, + "loss": 0.0385, + "step": 7350 + }, + { + "epoch": 0.4410090478758464, + "grad_norm": 0.9605218768119812, + "learning_rate": 1.794765217780547e-05, + "loss": 0.0359, + "step": 7360 + }, + { + "epoch": 0.44160824495176465, + "grad_norm": 0.5565723776817322, + "learning_rate": 1.794145047101148e-05, + "loss": 0.0391, + "step": 7370 + }, + { + "epoch": 0.4422074420276829, + "grad_norm": 0.7528144717216492, + "learning_rate": 1.793524061803872e-05, + "loss": 0.0431, + "step": 7380 + }, + { + "epoch": 0.4428066391036012, + "grad_norm": 0.5746167898178101, + "learning_rate": 1.792902262617481e-05, + "loss": 0.0346, + "step": 7390 + }, + { + "epoch": 0.44340583617951945, + "grad_norm": 0.5058369636535645, + "learning_rate": 1.7922796502716896e-05, + "loss": 0.0346, + "step": 7400 + }, + { + "epoch": 0.4440050332554377, + "grad_norm": 1.1387027502059937, + "learning_rate": 1.791656225497169e-05, + "loss": 0.0372, + "step": 7410 + }, + { + "epoch": 0.444604230331356, + "grad_norm": 0.819324254989624, + "learning_rate": 1.7910319890255428e-05, + "loss": 0.0374, + "step": 7420 + }, + { + "epoch": 0.44520342740727425, + "grad_norm": 0.45600345730781555, + "learning_rate": 1.7904069415893867e-05, + "loss": 0.0344, + "step": 7430 + }, + { + "epoch": 0.4458026244831925, + "grad_norm": 0.7428935766220093, + "learning_rate": 1.7897810839222294e-05, + "loss": 0.0373, + "step": 7440 + }, + { + "epoch": 0.4464018215591108, + "grad_norm": 0.6960753202438354, + "learning_rate": 1.7891544167585496e-05, + "loss": 0.0387, + "step": 7450 + }, + { + "epoch": 0.44700101863502906, + "grad_norm": 0.6637990474700928, + "learning_rate": 1.7885269408337753e-05, + "loss": 0.0404, + "step": 7460 + }, + { + "epoch": 0.4476002157109473, + "grad_norm": 0.5612137317657471, + "learning_rate": 1.7878986568842853e-05, + "loss": 0.0375, + "step": 7470 + }, + { + "epoch": 0.4481994127868656, + "grad_norm": 0.6323001384735107, + "learning_rate": 1.7872695656474057e-05, + "loss": 0.0379, + "step": 7480 + }, + { + "epoch": 0.44879860986278386, + "grad_norm": 0.35169267654418945, + "learning_rate": 1.7866396678614098e-05, + "loss": 0.0371, + "step": 7490 + }, + { + "epoch": 0.4493978069387021, + "grad_norm": 0.38252803683280945, + "learning_rate": 1.786008964265518e-05, + "loss": 0.0457, + "step": 7500 + }, + { + "epoch": 0.4499970040146204, + "grad_norm": 0.38694459199905396, + "learning_rate": 1.7853774555998963e-05, + "loss": 0.0345, + "step": 7510 + }, + { + "epoch": 0.45059620109053866, + "grad_norm": 0.37036198377609253, + "learning_rate": 1.784745142605655e-05, + "loss": 0.0292, + "step": 7520 + }, + { + "epoch": 0.45119539816645693, + "grad_norm": 0.8060199618339539, + "learning_rate": 1.7841120260248483e-05, + "loss": 0.0398, + "step": 7530 + }, + { + "epoch": 0.4517945952423752, + "grad_norm": 0.44252580404281616, + "learning_rate": 1.7834781066004743e-05, + "loss": 0.0373, + "step": 7540 + }, + { + "epoch": 0.45239379231829346, + "grad_norm": 0.5565180778503418, + "learning_rate": 1.7828433850764724e-05, + "loss": 0.0345, + "step": 7550 + }, + { + "epoch": 0.45299298939421173, + "grad_norm": 0.4460795521736145, + "learning_rate": 1.782207862197724e-05, + "loss": 0.0404, + "step": 7560 + }, + { + "epoch": 0.45359218647013, + "grad_norm": 0.7309815883636475, + "learning_rate": 1.7815715387100505e-05, + "loss": 0.0364, + "step": 7570 + }, + { + "epoch": 0.4541913835460483, + "grad_norm": 0.6990997195243835, + "learning_rate": 1.7809344153602126e-05, + "loss": 0.0561, + "step": 7580 + }, + { + "epoch": 0.4547905806219666, + "grad_norm": 0.4198327660560608, + "learning_rate": 1.7802964928959106e-05, + "loss": 0.0401, + "step": 7590 + }, + { + "epoch": 0.45538977769788486, + "grad_norm": 0.5436407923698425, + "learning_rate": 1.7796577720657816e-05, + "loss": 0.04, + "step": 7600 + }, + { + "epoch": 0.4559889747738031, + "grad_norm": 0.48884230852127075, + "learning_rate": 1.7790182536194002e-05, + "loss": 0.0334, + "step": 7610 + }, + { + "epoch": 0.4565881718497214, + "grad_norm": 0.6440362930297852, + "learning_rate": 1.778377938307277e-05, + "loss": 0.0451, + "step": 7620 + }, + { + "epoch": 0.45718736892563966, + "grad_norm": 0.9092825055122375, + "learning_rate": 1.777736826880858e-05, + "loss": 0.0398, + "step": 7630 + }, + { + "epoch": 0.4577865660015579, + "grad_norm": 0.4839508533477783, + "learning_rate": 1.7770949200925224e-05, + "loss": 0.0411, + "step": 7640 + }, + { + "epoch": 0.4583857630774762, + "grad_norm": 0.8128801584243774, + "learning_rate": 1.776452218695584e-05, + "loss": 0.038, + "step": 7650 + }, + { + "epoch": 0.45898496015339446, + "grad_norm": 0.5291397571563721, + "learning_rate": 1.7758087234442887e-05, + "loss": 0.0394, + "step": 7660 + }, + { + "epoch": 0.45958415722931273, + "grad_norm": 0.6852243542671204, + "learning_rate": 1.775164435093814e-05, + "loss": 0.0418, + "step": 7670 + }, + { + "epoch": 0.460183354305231, + "grad_norm": 0.6294205188751221, + "learning_rate": 1.774519354400268e-05, + "loss": 0.0374, + "step": 7680 + }, + { + "epoch": 0.46078255138114926, + "grad_norm": 0.5221384763717651, + "learning_rate": 1.7738734821206892e-05, + "loss": 0.0321, + "step": 7690 + }, + { + "epoch": 0.46138174845706753, + "grad_norm": 0.398296982049942, + "learning_rate": 1.7732268190130445e-05, + "loss": 0.0349, + "step": 7700 + }, + { + "epoch": 0.4619809455329858, + "grad_norm": 0.43008267879486084, + "learning_rate": 1.7725793658362287e-05, + "loss": 0.0426, + "step": 7710 + }, + { + "epoch": 0.46258014260890407, + "grad_norm": 0.6012991070747375, + "learning_rate": 1.771931123350065e-05, + "loss": 0.0411, + "step": 7720 + }, + { + "epoch": 0.46317933968482233, + "grad_norm": 0.45076051354408264, + "learning_rate": 1.7712820923153012e-05, + "loss": 0.037, + "step": 7730 + }, + { + "epoch": 0.4637785367607406, + "grad_norm": 0.6742259860038757, + "learning_rate": 1.770632273493612e-05, + "loss": 0.0357, + "step": 7740 + }, + { + "epoch": 0.46437773383665887, + "grad_norm": 0.5989789962768555, + "learning_rate": 1.7699816676475955e-05, + "loss": 0.037, + "step": 7750 + }, + { + "epoch": 0.46497693091257714, + "grad_norm": 0.4041040241718292, + "learning_rate": 1.769330275540774e-05, + "loss": 0.0325, + "step": 7760 + }, + { + "epoch": 0.4655761279884954, + "grad_norm": 0.4937855899333954, + "learning_rate": 1.768678097937593e-05, + "loss": 0.0354, + "step": 7770 + }, + { + "epoch": 0.46617532506441367, + "grad_norm": 0.5446217656135559, + "learning_rate": 1.7680251356034185e-05, + "loss": 0.0374, + "step": 7780 + }, + { + "epoch": 0.46677452214033194, + "grad_norm": 0.7479701638221741, + "learning_rate": 1.767371389304538e-05, + "loss": 0.0415, + "step": 7790 + }, + { + "epoch": 0.4673737192162502, + "grad_norm": 0.7822495102882385, + "learning_rate": 1.7667168598081604e-05, + "loss": 0.0341, + "step": 7800 + }, + { + "epoch": 0.4679729162921685, + "grad_norm": 0.3672648072242737, + "learning_rate": 1.7660615478824116e-05, + "loss": 0.035, + "step": 7810 + }, + { + "epoch": 0.46857211336808674, + "grad_norm": 0.5219965577125549, + "learning_rate": 1.765405454296337e-05, + "loss": 0.0443, + "step": 7820 + }, + { + "epoch": 0.469171310444005, + "grad_norm": 0.4092100262641907, + "learning_rate": 1.7647485798198983e-05, + "loss": 0.0331, + "step": 7830 + }, + { + "epoch": 0.46977050751992333, + "grad_norm": 0.5316944122314453, + "learning_rate": 1.7640909252239754e-05, + "loss": 0.0406, + "step": 7840 + }, + { + "epoch": 0.4703697045958416, + "grad_norm": 1.072263240814209, + "learning_rate": 1.7634324912803613e-05, + "loss": 0.0521, + "step": 7850 + }, + { + "epoch": 0.47096890167175987, + "grad_norm": 0.7448581457138062, + "learning_rate": 1.7627732787617657e-05, + "loss": 0.0362, + "step": 7860 + }, + { + "epoch": 0.47156809874767813, + "grad_norm": 0.44557711482048035, + "learning_rate": 1.7621132884418116e-05, + "loss": 0.0326, + "step": 7870 + }, + { + "epoch": 0.4721672958235964, + "grad_norm": 0.4298631250858307, + "learning_rate": 1.7614525210950334e-05, + "loss": 0.0365, + "step": 7880 + }, + { + "epoch": 0.47276649289951467, + "grad_norm": 0.45413365960121155, + "learning_rate": 1.760790977496879e-05, + "loss": 0.0351, + "step": 7890 + }, + { + "epoch": 0.47336568997543294, + "grad_norm": 0.9562819004058838, + "learning_rate": 1.7601286584237064e-05, + "loss": 0.0394, + "step": 7900 + }, + { + "epoch": 0.4739648870513512, + "grad_norm": 0.9481335878372192, + "learning_rate": 1.7594655646527844e-05, + "loss": 0.0381, + "step": 7910 + }, + { + "epoch": 0.47456408412726947, + "grad_norm": 0.5020818114280701, + "learning_rate": 1.75880169696229e-05, + "loss": 0.0402, + "step": 7920 + }, + { + "epoch": 0.47516328120318774, + "grad_norm": 0.6412234902381897, + "learning_rate": 1.758137056131309e-05, + "loss": 0.037, + "step": 7930 + }, + { + "epoch": 0.475762478279106, + "grad_norm": 0.5511493682861328, + "learning_rate": 1.7574716429398345e-05, + "loss": 0.0535, + "step": 7940 + }, + { + "epoch": 0.47636167535502427, + "grad_norm": 0.5222594141960144, + "learning_rate": 1.756805458168766e-05, + "loss": 0.0401, + "step": 7950 + }, + { + "epoch": 0.47696087243094254, + "grad_norm": 0.7247362732887268, + "learning_rate": 1.7561385025999083e-05, + "loss": 0.0391, + "step": 7960 + }, + { + "epoch": 0.4775600695068608, + "grad_norm": 0.447127103805542, + "learning_rate": 1.755470777015971e-05, + "loss": 0.0383, + "step": 7970 + }, + { + "epoch": 0.4781592665827791, + "grad_norm": 0.4780801832675934, + "learning_rate": 1.754802282200567e-05, + "loss": 0.041, + "step": 7980 + }, + { + "epoch": 0.47875846365869734, + "grad_norm": 0.2962804138660431, + "learning_rate": 1.7541330189382126e-05, + "loss": 0.0422, + "step": 7990 + }, + { + "epoch": 0.4793576607346156, + "grad_norm": 0.5125643014907837, + "learning_rate": 1.7534629880143254e-05, + "loss": 0.0337, + "step": 8000 + }, + { + "epoch": 0.4799568578105339, + "grad_norm": 0.4288216829299927, + "learning_rate": 1.752792190215224e-05, + "loss": 0.0374, + "step": 8010 + }, + { + "epoch": 0.48055605488645214, + "grad_norm": 0.4114690124988556, + "learning_rate": 1.7521206263281273e-05, + "loss": 0.0296, + "step": 8020 + }, + { + "epoch": 0.4811552519623704, + "grad_norm": 0.3511301577091217, + "learning_rate": 1.7514482971411516e-05, + "loss": 0.0315, + "step": 8030 + }, + { + "epoch": 0.4817544490382887, + "grad_norm": 0.8624657392501831, + "learning_rate": 1.7507752034433144e-05, + "loss": 0.0369, + "step": 8040 + }, + { + "epoch": 0.48235364611420695, + "grad_norm": 0.5518651008605957, + "learning_rate": 1.7501013460245275e-05, + "loss": 0.0364, + "step": 8050 + }, + { + "epoch": 0.4829528431901252, + "grad_norm": 0.5404661297798157, + "learning_rate": 1.7494267256756008e-05, + "loss": 0.0294, + "step": 8060 + }, + { + "epoch": 0.4835520402660435, + "grad_norm": 0.7494591474533081, + "learning_rate": 1.7487513431882385e-05, + "loss": 0.0315, + "step": 8070 + }, + { + "epoch": 0.48415123734196175, + "grad_norm": 0.9748606085777283, + "learning_rate": 1.7480751993550392e-05, + "loss": 0.0429, + "step": 8080 + }, + { + "epoch": 0.48475043441788, + "grad_norm": 0.8071768879890442, + "learning_rate": 1.747398294969496e-05, + "loss": 0.0321, + "step": 8090 + }, + { + "epoch": 0.4853496314937983, + "grad_norm": 0.5210712552070618, + "learning_rate": 1.746720630825994e-05, + "loss": 0.0355, + "step": 8100 + }, + { + "epoch": 0.4859488285697166, + "grad_norm": 0.6077958941459656, + "learning_rate": 1.7460422077198088e-05, + "loss": 0.0426, + "step": 8110 + }, + { + "epoch": 0.4865480256456349, + "grad_norm": 0.8688217997550964, + "learning_rate": 1.745363026447109e-05, + "loss": 0.0366, + "step": 8120 + }, + { + "epoch": 0.48714722272155314, + "grad_norm": 2.7064969539642334, + "learning_rate": 1.7446830878049507e-05, + "loss": 0.0465, + "step": 8130 + }, + { + "epoch": 0.4877464197974714, + "grad_norm": 0.5701860785484314, + "learning_rate": 1.74400239259128e-05, + "loss": 0.0365, + "step": 8140 + }, + { + "epoch": 0.4883456168733897, + "grad_norm": 0.6350638270378113, + "learning_rate": 1.743320941604931e-05, + "loss": 0.0419, + "step": 8150 + }, + { + "epoch": 0.48894481394930794, + "grad_norm": 0.42818939685821533, + "learning_rate": 1.7426387356456242e-05, + "loss": 0.0412, + "step": 8160 + }, + { + "epoch": 0.4895440110252262, + "grad_norm": 0.6915261745452881, + "learning_rate": 1.741955775513966e-05, + "loss": 0.0327, + "step": 8170 + }, + { + "epoch": 0.4901432081011445, + "grad_norm": 0.9861057996749878, + "learning_rate": 1.7412720620114486e-05, + "loss": 0.034, + "step": 8180 + }, + { + "epoch": 0.49074240517706275, + "grad_norm": 0.6910699009895325, + "learning_rate": 1.7405875959404475e-05, + "loss": 0.0463, + "step": 8190 + }, + { + "epoch": 0.491341602252981, + "grad_norm": 0.6368144750595093, + "learning_rate": 1.739902378104222e-05, + "loss": 0.0399, + "step": 8200 + }, + { + "epoch": 0.4919407993288993, + "grad_norm": 1.1909242868423462, + "learning_rate": 1.739216409306913e-05, + "loss": 0.042, + "step": 8210 + }, + { + "epoch": 0.49253999640481755, + "grad_norm": 0.6449970006942749, + "learning_rate": 1.738529690353544e-05, + "loss": 0.0388, + "step": 8220 + }, + { + "epoch": 0.4931391934807358, + "grad_norm": 0.531061053276062, + "learning_rate": 1.737842222050017e-05, + "loss": 0.0389, + "step": 8230 + }, + { + "epoch": 0.4937383905566541, + "grad_norm": 0.8275352716445923, + "learning_rate": 1.7371540052031148e-05, + "loss": 0.0503, + "step": 8240 + }, + { + "epoch": 0.49433758763257235, + "grad_norm": 0.8468548655509949, + "learning_rate": 1.7364650406204977e-05, + "loss": 0.0336, + "step": 8250 + }, + { + "epoch": 0.4949367847084906, + "grad_norm": 0.2949988842010498, + "learning_rate": 1.735775329110705e-05, + "loss": 0.0342, + "step": 8260 + }, + { + "epoch": 0.4955359817844089, + "grad_norm": 0.30603477358818054, + "learning_rate": 1.7350848714831505e-05, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.49613517886032715, + "grad_norm": 0.7177753448486328, + "learning_rate": 1.7343936685481254e-05, + "loss": 0.0381, + "step": 8280 + }, + { + "epoch": 0.4967343759362454, + "grad_norm": 0.4893733859062195, + "learning_rate": 1.7337017211167946e-05, + "loss": 0.0319, + "step": 8290 + }, + { + "epoch": 0.4973335730121637, + "grad_norm": 0.6618909239768982, + "learning_rate": 1.733009030001197e-05, + "loss": 0.0317, + "step": 8300 + }, + { + "epoch": 0.49793277008808196, + "grad_norm": 0.5965152382850647, + "learning_rate": 1.732315596014244e-05, + "loss": 0.0293, + "step": 8310 + }, + { + "epoch": 0.4985319671640002, + "grad_norm": 0.4357168674468994, + "learning_rate": 1.7316214199697196e-05, + "loss": 0.0478, + "step": 8320 + }, + { + "epoch": 0.4991311642399185, + "grad_norm": 0.9539002776145935, + "learning_rate": 1.7309265026822773e-05, + "loss": 0.0444, + "step": 8330 + }, + { + "epoch": 0.49973036131583676, + "grad_norm": 0.7171940207481384, + "learning_rate": 1.7302308449674417e-05, + "loss": 0.037, + "step": 8340 + }, + { + "epoch": 0.5003295583917551, + "grad_norm": 0.5711817741394043, + "learning_rate": 1.7295344476416057e-05, + "loss": 0.034, + "step": 8350 + }, + { + "epoch": 0.5009287554676733, + "grad_norm": 0.4134632647037506, + "learning_rate": 1.7288373115220304e-05, + "loss": 0.0389, + "step": 8360 + }, + { + "epoch": 0.5015279525435916, + "grad_norm": 0.39306095242500305, + "learning_rate": 1.728139437426844e-05, + "loss": 0.0351, + "step": 8370 + }, + { + "epoch": 0.5021271496195099, + "grad_norm": 0.318985253572464, + "learning_rate": 1.7274408261750403e-05, + "loss": 0.0425, + "step": 8380 + }, + { + "epoch": 0.5027263466954282, + "grad_norm": 0.7674827575683594, + "learning_rate": 1.7267414785864787e-05, + "loss": 0.041, + "step": 8390 + }, + { + "epoch": 0.5033255437713464, + "grad_norm": 0.7754977941513062, + "learning_rate": 1.726041395481883e-05, + "loss": 0.0436, + "step": 8400 + }, + { + "epoch": 0.5039247408472647, + "grad_norm": 0.5827674269676208, + "learning_rate": 1.7253405776828387e-05, + "loss": 0.0371, + "step": 8410 + }, + { + "epoch": 0.504523937923183, + "grad_norm": 0.3957774341106415, + "learning_rate": 1.7246390260117954e-05, + "loss": 0.0401, + "step": 8420 + }, + { + "epoch": 0.5051231349991012, + "grad_norm": 0.47415387630462646, + "learning_rate": 1.7239367412920626e-05, + "loss": 0.0344, + "step": 8430 + }, + { + "epoch": 0.5057223320750195, + "grad_norm": 0.6292631030082703, + "learning_rate": 1.723233724347811e-05, + "loss": 0.0417, + "step": 8440 + }, + { + "epoch": 0.5063215291509378, + "grad_norm": 0.5913583636283875, + "learning_rate": 1.7225299760040695e-05, + "loss": 0.0385, + "step": 8450 + }, + { + "epoch": 0.506920726226856, + "grad_norm": 0.465749055147171, + "learning_rate": 1.7218254970867265e-05, + "loss": 0.0402, + "step": 8460 + }, + { + "epoch": 0.5075199233027743, + "grad_norm": 0.7115443348884583, + "learning_rate": 1.7211202884225267e-05, + "loss": 0.0372, + "step": 8470 + }, + { + "epoch": 0.5081191203786926, + "grad_norm": 0.7476089596748352, + "learning_rate": 1.7204143508390728e-05, + "loss": 0.042, + "step": 8480 + }, + { + "epoch": 0.5087183174546108, + "grad_norm": 0.5902891159057617, + "learning_rate": 1.719707685164821e-05, + "loss": 0.0319, + "step": 8490 + }, + { + "epoch": 0.5093175145305291, + "grad_norm": 0.7117035984992981, + "learning_rate": 1.7190002922290827e-05, + "loss": 0.0312, + "step": 8500 + }, + { + "epoch": 0.5099167116064474, + "grad_norm": 0.7726907730102539, + "learning_rate": 1.7182921728620233e-05, + "loss": 0.0381, + "step": 8510 + }, + { + "epoch": 0.5105159086823656, + "grad_norm": 0.7318345308303833, + "learning_rate": 1.7175833278946604e-05, + "loss": 0.0464, + "step": 8520 + }, + { + "epoch": 0.5111151057582839, + "grad_norm": 0.8139578104019165, + "learning_rate": 1.716873758158863e-05, + "loss": 0.0334, + "step": 8530 + }, + { + "epoch": 0.5117143028342022, + "grad_norm": 0.6128831505775452, + "learning_rate": 1.7161634644873506e-05, + "loss": 0.0338, + "step": 8540 + }, + { + "epoch": 0.5123134999101204, + "grad_norm": 0.478384405374527, + "learning_rate": 1.715452447713692e-05, + "loss": 0.0361, + "step": 8550 + }, + { + "epoch": 0.5129126969860387, + "grad_norm": 0.36900776624679565, + "learning_rate": 1.714740708672306e-05, + "loss": 0.0473, + "step": 8560 + }, + { + "epoch": 0.513511894061957, + "grad_norm": 1.031351923942566, + "learning_rate": 1.714028248198457e-05, + "loss": 0.0417, + "step": 8570 + }, + { + "epoch": 0.5141110911378752, + "grad_norm": 0.5248333215713501, + "learning_rate": 1.7133150671282576e-05, + "loss": 0.0402, + "step": 8580 + }, + { + "epoch": 0.5147102882137935, + "grad_norm": 0.6325647830963135, + "learning_rate": 1.7126011662986652e-05, + "loss": 0.047, + "step": 8590 + }, + { + "epoch": 0.5153094852897118, + "grad_norm": 0.8417870402336121, + "learning_rate": 1.7118865465474824e-05, + "loss": 0.0406, + "step": 8600 + }, + { + "epoch": 0.51590868236563, + "grad_norm": 0.617125391960144, + "learning_rate": 1.711171208713355e-05, + "loss": 0.0385, + "step": 8610 + }, + { + "epoch": 0.5165078794415483, + "grad_norm": 0.4480224847793579, + "learning_rate": 1.7104551536357727e-05, + "loss": 0.0391, + "step": 8620 + }, + { + "epoch": 0.5171070765174666, + "grad_norm": 1.0203324556350708, + "learning_rate": 1.7097383821550646e-05, + "loss": 0.0379, + "step": 8630 + }, + { + "epoch": 0.5177062735933848, + "grad_norm": 0.6231842637062073, + "learning_rate": 1.7090208951124033e-05, + "loss": 0.0318, + "step": 8640 + }, + { + "epoch": 0.5183054706693031, + "grad_norm": 0.37685611844062805, + "learning_rate": 1.708302693349799e-05, + "loss": 0.0304, + "step": 8650 + }, + { + "epoch": 0.5189046677452214, + "grad_norm": 1.0700500011444092, + "learning_rate": 1.707583777710101e-05, + "loss": 0.0362, + "step": 8660 + }, + { + "epoch": 0.5195038648211396, + "grad_norm": 0.4233555495738983, + "learning_rate": 1.7068641490369986e-05, + "loss": 0.0341, + "step": 8670 + }, + { + "epoch": 0.5201030618970579, + "grad_norm": 0.7783017158508301, + "learning_rate": 1.7061438081750144e-05, + "loss": 0.0331, + "step": 8680 + }, + { + "epoch": 0.5207022589729762, + "grad_norm": 0.718287467956543, + "learning_rate": 1.7054227559695093e-05, + "loss": 0.0385, + "step": 8690 + }, + { + "epoch": 0.5213014560488944, + "grad_norm": 0.5477543473243713, + "learning_rate": 1.704700993266678e-05, + "loss": 0.0308, + "step": 8700 + }, + { + "epoch": 0.5219006531248127, + "grad_norm": 0.5601311326026917, + "learning_rate": 1.7039785209135496e-05, + "loss": 0.0384, + "step": 8710 + }, + { + "epoch": 0.522499850200731, + "grad_norm": 0.4944303631782532, + "learning_rate": 1.7032553397579855e-05, + "loss": 0.0334, + "step": 8720 + }, + { + "epoch": 0.5230990472766492, + "grad_norm": 0.5038384199142456, + "learning_rate": 1.7025314506486786e-05, + "loss": 0.0382, + "step": 8730 + }, + { + "epoch": 0.5236982443525675, + "grad_norm": 0.7288672924041748, + "learning_rate": 1.7018068544351536e-05, + "loss": 0.0319, + "step": 8740 + }, + { + "epoch": 0.5242974414284858, + "grad_norm": 1.0376721620559692, + "learning_rate": 1.701081551967764e-05, + "loss": 0.0374, + "step": 8750 + }, + { + "epoch": 0.524896638504404, + "grad_norm": 0.8827543258666992, + "learning_rate": 1.7003555440976934e-05, + "loss": 0.0351, + "step": 8760 + }, + { + "epoch": 0.5254958355803224, + "grad_norm": 0.4307865798473358, + "learning_rate": 1.699628831676952e-05, + "loss": 0.0321, + "step": 8770 + }, + { + "epoch": 0.5260950326562407, + "grad_norm": 0.5480561256408691, + "learning_rate": 1.6989014155583775e-05, + "loss": 0.0532, + "step": 8780 + }, + { + "epoch": 0.526694229732159, + "grad_norm": 0.9598987102508545, + "learning_rate": 1.6981732965956334e-05, + "loss": 0.0365, + "step": 8790 + }, + { + "epoch": 0.5272934268080772, + "grad_norm": 0.4162677228450775, + "learning_rate": 1.697444475643207e-05, + "loss": 0.0274, + "step": 8800 + }, + { + "epoch": 0.5278926238839955, + "grad_norm": 0.8729338049888611, + "learning_rate": 1.696714953556411e-05, + "loss": 0.0437, + "step": 8810 + }, + { + "epoch": 0.5284918209599138, + "grad_norm": 0.7729384899139404, + "learning_rate": 1.69598473119138e-05, + "loss": 0.0386, + "step": 8820 + }, + { + "epoch": 0.529091018035832, + "grad_norm": 0.6997544169425964, + "learning_rate": 1.6952538094050708e-05, + "loss": 0.0303, + "step": 8830 + }, + { + "epoch": 0.5296902151117503, + "grad_norm": 0.49331608414649963, + "learning_rate": 1.6945221890552608e-05, + "loss": 0.0333, + "step": 8840 + }, + { + "epoch": 0.5302894121876686, + "grad_norm": 0.6684675812721252, + "learning_rate": 1.693789871000547e-05, + "loss": 0.0329, + "step": 8850 + }, + { + "epoch": 0.5308886092635868, + "grad_norm": 0.5638986825942993, + "learning_rate": 1.6930568561003456e-05, + "loss": 0.035, + "step": 8860 + }, + { + "epoch": 0.5314878063395051, + "grad_norm": 0.8375849723815918, + "learning_rate": 1.6923231452148904e-05, + "loss": 0.0431, + "step": 8870 + }, + { + "epoch": 0.5320870034154234, + "grad_norm": 0.5796175599098206, + "learning_rate": 1.6915887392052316e-05, + "loss": 0.0298, + "step": 8880 + }, + { + "epoch": 0.5326862004913416, + "grad_norm": 0.5302409529685974, + "learning_rate": 1.6908536389332363e-05, + "loss": 0.032, + "step": 8890 + }, + { + "epoch": 0.5332853975672599, + "grad_norm": 0.43450990319252014, + "learning_rate": 1.6901178452615853e-05, + "loss": 0.0415, + "step": 8900 + }, + { + "epoch": 0.5338845946431782, + "grad_norm": 0.3897189795970917, + "learning_rate": 1.689381359053773e-05, + "loss": 0.0372, + "step": 8910 + }, + { + "epoch": 0.5344837917190964, + "grad_norm": 0.8202592134475708, + "learning_rate": 1.688644181174108e-05, + "loss": 0.0329, + "step": 8920 + }, + { + "epoch": 0.5350829887950147, + "grad_norm": 0.8023095726966858, + "learning_rate": 1.6879063124877083e-05, + "loss": 0.0426, + "step": 8930 + }, + { + "epoch": 0.535682185870933, + "grad_norm": 0.3732883930206299, + "learning_rate": 1.6871677538605053e-05, + "loss": 0.0326, + "step": 8940 + }, + { + "epoch": 0.5362813829468512, + "grad_norm": 0.4916521906852722, + "learning_rate": 1.6864285061592376e-05, + "loss": 0.031, + "step": 8950 + }, + { + "epoch": 0.5368805800227695, + "grad_norm": 0.46110638976097107, + "learning_rate": 1.6856885702514537e-05, + "loss": 0.037, + "step": 8960 + }, + { + "epoch": 0.5374797770986878, + "grad_norm": 0.8587718605995178, + "learning_rate": 1.6849479470055104e-05, + "loss": 0.0351, + "step": 8970 + }, + { + "epoch": 0.538078974174606, + "grad_norm": 0.7067242860794067, + "learning_rate": 1.6842066372905696e-05, + "loss": 0.036, + "step": 8980 + }, + { + "epoch": 0.5386781712505243, + "grad_norm": 0.732545793056488, + "learning_rate": 1.6834646419765994e-05, + "loss": 0.036, + "step": 8990 + }, + { + "epoch": 0.5392773683264426, + "grad_norm": 0.6573438048362732, + "learning_rate": 1.6827219619343727e-05, + "loss": 0.0392, + "step": 9000 + }, + { + "epoch": 0.5398765654023608, + "grad_norm": 0.6036579608917236, + "learning_rate": 1.681978598035467e-05, + "loss": 0.0383, + "step": 9010 + }, + { + "epoch": 0.5404757624782791, + "grad_norm": 0.5556638836860657, + "learning_rate": 1.6812345511522602e-05, + "loss": 0.0396, + "step": 9020 + }, + { + "epoch": 0.5410749595541974, + "grad_norm": 0.7848073244094849, + "learning_rate": 1.6804898221579323e-05, + "loss": 0.0333, + "step": 9030 + }, + { + "epoch": 0.5416741566301156, + "grad_norm": 0.5758033394813538, + "learning_rate": 1.6797444119264655e-05, + "loss": 0.0315, + "step": 9040 + }, + { + "epoch": 0.5422733537060339, + "grad_norm": 0.5620765686035156, + "learning_rate": 1.6789983213326393e-05, + "loss": 0.0277, + "step": 9050 + }, + { + "epoch": 0.5428725507819522, + "grad_norm": 0.38210418820381165, + "learning_rate": 1.6782515512520326e-05, + "loss": 0.0437, + "step": 9060 + }, + { + "epoch": 0.5434717478578704, + "grad_norm": 0.6145310997962952, + "learning_rate": 1.6775041025610227e-05, + "loss": 0.0368, + "step": 9070 + }, + { + "epoch": 0.5440709449337887, + "grad_norm": 0.7370103001594543, + "learning_rate": 1.6767559761367812e-05, + "loss": 0.0349, + "step": 9080 + }, + { + "epoch": 0.544670142009707, + "grad_norm": 0.942118763923645, + "learning_rate": 1.676007172857276e-05, + "loss": 0.0399, + "step": 9090 + }, + { + "epoch": 0.5452693390856252, + "grad_norm": 0.5294848680496216, + "learning_rate": 1.6752576936012705e-05, + "loss": 0.0364, + "step": 9100 + }, + { + "epoch": 0.5458685361615435, + "grad_norm": 0.5716073513031006, + "learning_rate": 1.6745075392483197e-05, + "loss": 0.0313, + "step": 9110 + }, + { + "epoch": 0.5464677332374618, + "grad_norm": 0.4549729526042938, + "learning_rate": 1.6737567106787714e-05, + "loss": 0.0423, + "step": 9120 + }, + { + "epoch": 0.54706693031338, + "grad_norm": 0.5841232538223267, + "learning_rate": 1.6730052087737654e-05, + "loss": 0.0369, + "step": 9130 + }, + { + "epoch": 0.5476661273892983, + "grad_norm": 0.3302208483219147, + "learning_rate": 1.6722530344152302e-05, + "loss": 0.032, + "step": 9140 + }, + { + "epoch": 0.5482653244652166, + "grad_norm": 0.7107377648353577, + "learning_rate": 1.6715001884858848e-05, + "loss": 0.0382, + "step": 9150 + }, + { + "epoch": 0.5488645215411349, + "grad_norm": 0.6884296536445618, + "learning_rate": 1.6707466718692354e-05, + "loss": 0.0324, + "step": 9160 + }, + { + "epoch": 0.5494637186170531, + "grad_norm": 0.6279621720314026, + "learning_rate": 1.6699924854495766e-05, + "loss": 0.0314, + "step": 9170 + }, + { + "epoch": 0.5500629156929714, + "grad_norm": 0.882046103477478, + "learning_rate": 1.669237630111987e-05, + "loss": 0.0408, + "step": 9180 + }, + { + "epoch": 0.5506621127688897, + "grad_norm": 0.8980706334114075, + "learning_rate": 1.6684821067423325e-05, + "loss": 0.0436, + "step": 9190 + }, + { + "epoch": 0.5512613098448079, + "grad_norm": 0.6433938145637512, + "learning_rate": 1.6677259162272615e-05, + "loss": 0.0395, + "step": 9200 + }, + { + "epoch": 0.5518605069207262, + "grad_norm": 0.6394492983818054, + "learning_rate": 1.6669690594542055e-05, + "loss": 0.041, + "step": 9210 + }, + { + "epoch": 0.5524597039966445, + "grad_norm": 0.8700910806655884, + "learning_rate": 1.666211537311378e-05, + "loss": 0.0333, + "step": 9220 + }, + { + "epoch": 0.5530589010725627, + "grad_norm": 0.6309515237808228, + "learning_rate": 1.665453350687773e-05, + "loss": 0.0341, + "step": 9230 + }, + { + "epoch": 0.553658098148481, + "grad_norm": 0.7955977916717529, + "learning_rate": 1.664694500473166e-05, + "loss": 0.0417, + "step": 9240 + }, + { + "epoch": 0.5542572952243993, + "grad_norm": 0.8543604016304016, + "learning_rate": 1.663934987558109e-05, + "loss": 0.042, + "step": 9250 + }, + { + "epoch": 0.5548564923003175, + "grad_norm": 0.6915370225906372, + "learning_rate": 1.6631748128339332e-05, + "loss": 0.0347, + "step": 9260 + }, + { + "epoch": 0.5554556893762358, + "grad_norm": 0.6430726647377014, + "learning_rate": 1.6624139771927453e-05, + "loss": 0.0395, + "step": 9270 + }, + { + "epoch": 0.5560548864521541, + "grad_norm": 0.3080710768699646, + "learning_rate": 1.6616524815274292e-05, + "loss": 0.0299, + "step": 9280 + }, + { + "epoch": 0.5566540835280723, + "grad_norm": 0.8261982202529907, + "learning_rate": 1.660890326731642e-05, + "loss": 0.0407, + "step": 9290 + }, + { + "epoch": 0.5572532806039906, + "grad_norm": 0.7147136330604553, + "learning_rate": 1.6601275136998148e-05, + "loss": 0.0524, + "step": 9300 + }, + { + "epoch": 0.557852477679909, + "grad_norm": 0.603560209274292, + "learning_rate": 1.6593640433271514e-05, + "loss": 0.032, + "step": 9310 + }, + { + "epoch": 0.5584516747558272, + "grad_norm": 0.4913748502731323, + "learning_rate": 1.6585999165096266e-05, + "loss": 0.0419, + "step": 9320 + }, + { + "epoch": 0.5590508718317455, + "grad_norm": 0.532796323299408, + "learning_rate": 1.6578351341439858e-05, + "loss": 0.0463, + "step": 9330 + }, + { + "epoch": 0.5596500689076638, + "grad_norm": 0.7834717631340027, + "learning_rate": 1.6570696971277437e-05, + "loss": 0.0318, + "step": 9340 + }, + { + "epoch": 0.560249265983582, + "grad_norm": 0.4865007698535919, + "learning_rate": 1.656303606359183e-05, + "loss": 0.0329, + "step": 9350 + }, + { + "epoch": 0.5608484630595003, + "grad_norm": 0.5567988753318787, + "learning_rate": 1.655536862737355e-05, + "loss": 0.0331, + "step": 9360 + }, + { + "epoch": 0.5614476601354186, + "grad_norm": 0.7487075328826904, + "learning_rate": 1.6547694671620743e-05, + "loss": 0.0408, + "step": 9370 + }, + { + "epoch": 0.5620468572113368, + "grad_norm": 0.40907222032546997, + "learning_rate": 1.654001420533924e-05, + "loss": 0.0294, + "step": 9380 + }, + { + "epoch": 0.5626460542872551, + "grad_norm": 0.7240496277809143, + "learning_rate": 1.6532327237542487e-05, + "loss": 0.0334, + "step": 9390 + }, + { + "epoch": 0.5632452513631734, + "grad_norm": 0.44733667373657227, + "learning_rate": 1.6524633777251572e-05, + "loss": 0.0378, + "step": 9400 + }, + { + "epoch": 0.5638444484390917, + "grad_norm": 0.7610008716583252, + "learning_rate": 1.6516933833495197e-05, + "loss": 0.0398, + "step": 9410 + }, + { + "epoch": 0.5644436455150099, + "grad_norm": 1.0738579034805298, + "learning_rate": 1.650922741530967e-05, + "loss": 0.0461, + "step": 9420 + }, + { + "epoch": 0.5650428425909282, + "grad_norm": 0.5492804050445557, + "learning_rate": 1.6501514531738915e-05, + "loss": 0.0367, + "step": 9430 + }, + { + "epoch": 0.5656420396668465, + "grad_norm": 0.7817861437797546, + "learning_rate": 1.6493795191834424e-05, + "loss": 0.0392, + "step": 9440 + }, + { + "epoch": 0.5662412367427647, + "grad_norm": 0.6080313324928284, + "learning_rate": 1.648606940465527e-05, + "loss": 0.0288, + "step": 9450 + }, + { + "epoch": 0.566840433818683, + "grad_norm": 0.8218061923980713, + "learning_rate": 1.6478337179268095e-05, + "loss": 0.0335, + "step": 9460 + }, + { + "epoch": 0.5674396308946013, + "grad_norm": 0.6597305536270142, + "learning_rate": 1.6470598524747098e-05, + "loss": 0.0398, + "step": 9470 + }, + { + "epoch": 0.5680388279705195, + "grad_norm": 0.6254639625549316, + "learning_rate": 1.6462853450174023e-05, + "loss": 0.0339, + "step": 9480 + }, + { + "epoch": 0.5686380250464378, + "grad_norm": 1.0747283697128296, + "learning_rate": 1.645510196463814e-05, + "loss": 0.0386, + "step": 9490 + }, + { + "epoch": 0.5692372221223561, + "grad_norm": 0.4679741859436035, + "learning_rate": 1.6447344077236257e-05, + "loss": 0.0409, + "step": 9500 + }, + { + "epoch": 0.5698364191982743, + "grad_norm": 0.7349653244018555, + "learning_rate": 1.6439579797072678e-05, + "loss": 0.0355, + "step": 9510 + }, + { + "epoch": 0.5704356162741926, + "grad_norm": 0.47712597250938416, + "learning_rate": 1.6431809133259227e-05, + "loss": 0.0524, + "step": 9520 + }, + { + "epoch": 0.5710348133501109, + "grad_norm": 0.8520345091819763, + "learning_rate": 1.642403209491521e-05, + "loss": 0.0361, + "step": 9530 + }, + { + "epoch": 0.5716340104260291, + "grad_norm": 0.6470016837120056, + "learning_rate": 1.6416248691167408e-05, + "loss": 0.0296, + "step": 9540 + }, + { + "epoch": 0.5722332075019474, + "grad_norm": 0.8512763381004333, + "learning_rate": 1.640845893115008e-05, + "loss": 0.0329, + "step": 9550 + }, + { + "epoch": 0.5728324045778657, + "grad_norm": 0.5876182913780212, + "learning_rate": 1.640066282400495e-05, + "loss": 0.0381, + "step": 9560 + }, + { + "epoch": 0.5734316016537839, + "grad_norm": 0.47419166564941406, + "learning_rate": 1.6392860378881175e-05, + "loss": 0.0348, + "step": 9570 + }, + { + "epoch": 0.5740307987297022, + "grad_norm": 0.391215056180954, + "learning_rate": 1.638505160493537e-05, + "loss": 0.0366, + "step": 9580 + }, + { + "epoch": 0.5746299958056205, + "grad_norm": 0.5373614430427551, + "learning_rate": 1.6377236511331555e-05, + "loss": 0.0373, + "step": 9590 + }, + { + "epoch": 0.5752291928815387, + "grad_norm": 0.23266319930553436, + "learning_rate": 1.6369415107241185e-05, + "loss": 0.0283, + "step": 9600 + }, + { + "epoch": 0.575828389957457, + "grad_norm": 0.8146935105323792, + "learning_rate": 1.6361587401843113e-05, + "loss": 0.0377, + "step": 9610 + }, + { + "epoch": 0.5764275870333753, + "grad_norm": 0.5002696514129639, + "learning_rate": 1.6353753404323582e-05, + "loss": 0.0296, + "step": 9620 + }, + { + "epoch": 0.5770267841092935, + "grad_norm": 0.7518969774246216, + "learning_rate": 1.634591312387623e-05, + "loss": 0.0394, + "step": 9630 + }, + { + "epoch": 0.5776259811852118, + "grad_norm": 0.44596755504608154, + "learning_rate": 1.6338066569702057e-05, + "loss": 0.0359, + "step": 9640 + }, + { + "epoch": 0.5782251782611301, + "grad_norm": 0.37095823884010315, + "learning_rate": 1.6330213751009437e-05, + "loss": 0.031, + "step": 9650 + }, + { + "epoch": 0.5788243753370483, + "grad_norm": 0.48388785123825073, + "learning_rate": 1.6322354677014087e-05, + "loss": 0.0323, + "step": 9660 + }, + { + "epoch": 0.5794235724129666, + "grad_norm": 0.4681354761123657, + "learning_rate": 1.6314489356939072e-05, + "loss": 0.0573, + "step": 9670 + }, + { + "epoch": 0.5800227694888849, + "grad_norm": 0.9335370063781738, + "learning_rate": 1.6306617800014776e-05, + "loss": 0.0397, + "step": 9680 + }, + { + "epoch": 0.5806219665648031, + "grad_norm": 0.8231816291809082, + "learning_rate": 1.6298740015478917e-05, + "loss": 0.0307, + "step": 9690 + }, + { + "epoch": 0.5812211636407214, + "grad_norm": 0.7194622755050659, + "learning_rate": 1.6290856012576508e-05, + "loss": 0.0435, + "step": 9700 + }, + { + "epoch": 0.5818203607166397, + "grad_norm": 0.468923419713974, + "learning_rate": 1.6282965800559872e-05, + "loss": 0.0364, + "step": 9710 + }, + { + "epoch": 0.5824195577925579, + "grad_norm": 0.5806415677070618, + "learning_rate": 1.6275069388688604e-05, + "loss": 0.0422, + "step": 9720 + }, + { + "epoch": 0.5830187548684762, + "grad_norm": 0.6381694078445435, + "learning_rate": 1.6267166786229595e-05, + "loss": 0.0325, + "step": 9730 + }, + { + "epoch": 0.5836179519443945, + "grad_norm": 0.6025328636169434, + "learning_rate": 1.6259258002456975e-05, + "loss": 0.0321, + "step": 9740 + }, + { + "epoch": 0.5842171490203127, + "grad_norm": 0.7287771701812744, + "learning_rate": 1.625134304665215e-05, + "loss": 0.0432, + "step": 9750 + }, + { + "epoch": 0.584816346096231, + "grad_norm": 0.7109095454216003, + "learning_rate": 1.6243421928103763e-05, + "loss": 0.0315, + "step": 9760 + }, + { + "epoch": 0.5854155431721493, + "grad_norm": 0.4904409348964691, + "learning_rate": 1.6235494656107683e-05, + "loss": 0.0317, + "step": 9770 + }, + { + "epoch": 0.5860147402480675, + "grad_norm": 0.7382795214653015, + "learning_rate": 1.6227561239967013e-05, + "loss": 0.0296, + "step": 9780 + }, + { + "epoch": 0.5866139373239858, + "grad_norm": 1.2814927101135254, + "learning_rate": 1.6219621688992046e-05, + "loss": 0.043, + "step": 9790 + }, + { + "epoch": 0.5872131343999041, + "grad_norm": 0.4594469368457794, + "learning_rate": 1.62116760125003e-05, + "loss": 0.0297, + "step": 9800 + }, + { + "epoch": 0.5878123314758223, + "grad_norm": 0.5907943844795227, + "learning_rate": 1.6203724219816462e-05, + "loss": 0.0387, + "step": 9810 + }, + { + "epoch": 0.5884115285517406, + "grad_norm": 0.623093843460083, + "learning_rate": 1.6195766320272402e-05, + "loss": 0.0314, + "step": 9820 + }, + { + "epoch": 0.5890107256276589, + "grad_norm": 0.5146417021751404, + "learning_rate": 1.6187802323207166e-05, + "loss": 0.0362, + "step": 9830 + }, + { + "epoch": 0.5896099227035773, + "grad_norm": 0.5858095288276672, + "learning_rate": 1.6179832237966945e-05, + "loss": 0.0339, + "step": 9840 + }, + { + "epoch": 0.5902091197794955, + "grad_norm": 0.4178197383880615, + "learning_rate": 1.617185607390507e-05, + "loss": 0.0445, + "step": 9850 + }, + { + "epoch": 0.5908083168554138, + "grad_norm": 0.37311851978302, + "learning_rate": 1.616387384038203e-05, + "loss": 0.0321, + "step": 9860 + }, + { + "epoch": 0.5914075139313321, + "grad_norm": 0.6305625438690186, + "learning_rate": 1.6155885546765403e-05, + "loss": 0.0376, + "step": 9870 + }, + { + "epoch": 0.5920067110072503, + "grad_norm": 0.5927552580833435, + "learning_rate": 1.6147891202429907e-05, + "loss": 0.0339, + "step": 9880 + }, + { + "epoch": 0.5926059080831686, + "grad_norm": 0.4024806022644043, + "learning_rate": 1.613989081675735e-05, + "loss": 0.0286, + "step": 9890 + }, + { + "epoch": 0.5932051051590869, + "grad_norm": 0.5766516327857971, + "learning_rate": 1.613188439913663e-05, + "loss": 0.0325, + "step": 9900 + }, + { + "epoch": 0.5938043022350051, + "grad_norm": 0.4729812443256378, + "learning_rate": 1.612387195896372e-05, + "loss": 0.0476, + "step": 9910 + }, + { + "epoch": 0.5944034993109234, + "grad_norm": 0.4650471806526184, + "learning_rate": 1.611585350564167e-05, + "loss": 0.0387, + "step": 9920 + }, + { + "epoch": 0.5950026963868417, + "grad_norm": 0.6432391405105591, + "learning_rate": 1.6107829048580573e-05, + "loss": 0.0403, + "step": 9930 + }, + { + "epoch": 0.5956018934627599, + "grad_norm": 0.6335821151733398, + "learning_rate": 1.6099798597197583e-05, + "loss": 0.0307, + "step": 9940 + }, + { + "epoch": 0.5962010905386782, + "grad_norm": 0.5947774052619934, + "learning_rate": 1.6091762160916883e-05, + "loss": 0.0374, + "step": 9950 + }, + { + "epoch": 0.5968002876145965, + "grad_norm": 0.7248526811599731, + "learning_rate": 1.6083719749169676e-05, + "loss": 0.0286, + "step": 9960 + }, + { + "epoch": 0.5973994846905147, + "grad_norm": 0.5646173357963562, + "learning_rate": 1.6075671371394183e-05, + "loss": 0.0426, + "step": 9970 + }, + { + "epoch": 0.597998681766433, + "grad_norm": 0.4240330457687378, + "learning_rate": 1.606761703703562e-05, + "loss": 0.0261, + "step": 9980 + }, + { + "epoch": 0.5985978788423513, + "grad_norm": 0.6439619064331055, + "learning_rate": 1.6059556755546197e-05, + "loss": 0.0325, + "step": 9990 + }, + { + "epoch": 0.5991970759182695, + "grad_norm": 0.5899927020072937, + "learning_rate": 1.6051490536385108e-05, + "loss": 0.0328, + "step": 10000 + }, + { + "epoch": 0.5997962729941878, + "grad_norm": 0.6412765383720398, + "learning_rate": 1.6043418389018504e-05, + "loss": 0.027, + "step": 10010 + }, + { + "epoch": 0.6003954700701061, + "grad_norm": 0.28143197298049927, + "learning_rate": 1.6035340322919505e-05, + "loss": 0.0285, + "step": 10020 + }, + { + "epoch": 0.6009946671460243, + "grad_norm": 0.2767931818962097, + "learning_rate": 1.6027256347568167e-05, + "loss": 0.0312, + "step": 10030 + }, + { + "epoch": 0.6015938642219426, + "grad_norm": 0.47175201773643494, + "learning_rate": 1.601916647245149e-05, + "loss": 0.0318, + "step": 10040 + }, + { + "epoch": 0.6021930612978609, + "grad_norm": 0.4454171359539032, + "learning_rate": 1.601107070706339e-05, + "loss": 0.0357, + "step": 10050 + }, + { + "epoch": 0.6027922583737791, + "grad_norm": 0.4573518931865692, + "learning_rate": 1.60029690609047e-05, + "loss": 0.0319, + "step": 10060 + }, + { + "epoch": 0.6033914554496974, + "grad_norm": 0.5321150422096252, + "learning_rate": 1.5994861543483147e-05, + "loss": 0.0423, + "step": 10070 + }, + { + "epoch": 0.6039906525256157, + "grad_norm": 0.27531248331069946, + "learning_rate": 1.5986748164313365e-05, + "loss": 0.0284, + "step": 10080 + }, + { + "epoch": 0.604589849601534, + "grad_norm": 0.663298487663269, + "learning_rate": 1.597862893291685e-05, + "loss": 0.0328, + "step": 10090 + }, + { + "epoch": 0.6051890466774522, + "grad_norm": 0.9017484188079834, + "learning_rate": 1.5970503858821973e-05, + "loss": 0.0328, + "step": 10100 + }, + { + "epoch": 0.6057882437533705, + "grad_norm": 0.6192354559898376, + "learning_rate": 1.5962372951563964e-05, + "loss": 0.0445, + "step": 10110 + }, + { + "epoch": 0.6063874408292887, + "grad_norm": 0.4777899980545044, + "learning_rate": 1.595423622068489e-05, + "loss": 0.0348, + "step": 10120 + }, + { + "epoch": 0.606986637905207, + "grad_norm": 0.5475958585739136, + "learning_rate": 1.594609367573366e-05, + "loss": 0.0418, + "step": 10130 + }, + { + "epoch": 0.6075858349811253, + "grad_norm": 0.524467408657074, + "learning_rate": 1.5937945326266004e-05, + "loss": 0.0301, + "step": 10140 + }, + { + "epoch": 0.6081850320570436, + "grad_norm": 0.6302708387374878, + "learning_rate": 1.592979118184447e-05, + "loss": 0.0334, + "step": 10150 + }, + { + "epoch": 0.6087842291329618, + "grad_norm": 0.41625329852104187, + "learning_rate": 1.592163125203839e-05, + "loss": 0.0353, + "step": 10160 + }, + { + "epoch": 0.6093834262088801, + "grad_norm": 0.2699313759803772, + "learning_rate": 1.5913465546423902e-05, + "loss": 0.0387, + "step": 10170 + }, + { + "epoch": 0.6099826232847984, + "grad_norm": 0.701999306678772, + "learning_rate": 1.5905294074583916e-05, + "loss": 0.0365, + "step": 10180 + }, + { + "epoch": 0.6105818203607166, + "grad_norm": 0.6053565144538879, + "learning_rate": 1.5897116846108114e-05, + "loss": 0.0343, + "step": 10190 + }, + { + "epoch": 0.6111810174366349, + "grad_norm": 0.864326000213623, + "learning_rate": 1.5888933870592918e-05, + "loss": 0.0371, + "step": 10200 + }, + { + "epoch": 0.6117802145125532, + "grad_norm": 0.7532107830047607, + "learning_rate": 1.5880745157641514e-05, + "loss": 0.0323, + "step": 10210 + }, + { + "epoch": 0.6123794115884714, + "grad_norm": 0.5603524446487427, + "learning_rate": 1.587255071686381e-05, + "loss": 0.0357, + "step": 10220 + }, + { + "epoch": 0.6129786086643897, + "grad_norm": 0.5668624639511108, + "learning_rate": 1.586435055787644e-05, + "loss": 0.0421, + "step": 10230 + }, + { + "epoch": 0.613577805740308, + "grad_norm": 0.6352995038032532, + "learning_rate": 1.585614469030275e-05, + "loss": 0.0381, + "step": 10240 + }, + { + "epoch": 0.6141770028162262, + "grad_norm": 0.7873902320861816, + "learning_rate": 1.584793312377278e-05, + "loss": 0.0293, + "step": 10250 + }, + { + "epoch": 0.6147761998921445, + "grad_norm": 0.5853860378265381, + "learning_rate": 1.583971586792325e-05, + "loss": 0.0336, + "step": 10260 + }, + { + "epoch": 0.6153753969680628, + "grad_norm": 0.525260329246521, + "learning_rate": 1.583149293239759e-05, + "loss": 0.0404, + "step": 10270 + }, + { + "epoch": 0.615974594043981, + "grad_norm": 0.4027518033981323, + "learning_rate": 1.582326432684585e-05, + "loss": 0.0334, + "step": 10280 + }, + { + "epoch": 0.6165737911198993, + "grad_norm": 0.9426722526550293, + "learning_rate": 1.5815030060924775e-05, + "loss": 0.0397, + "step": 10290 + }, + { + "epoch": 0.6171729881958176, + "grad_norm": 0.6003656983375549, + "learning_rate": 1.5806790144297723e-05, + "loss": 0.0408, + "step": 10300 + }, + { + "epoch": 0.6177721852717358, + "grad_norm": 0.643667459487915, + "learning_rate": 1.57985445866347e-05, + "loss": 0.0507, + "step": 10310 + }, + { + "epoch": 0.6183713823476541, + "grad_norm": 0.6342907547950745, + "learning_rate": 1.5790293397612323e-05, + "loss": 0.0338, + "step": 10320 + }, + { + "epoch": 0.6189705794235724, + "grad_norm": 0.4388107657432556, + "learning_rate": 1.5782036586913833e-05, + "loss": 0.0393, + "step": 10330 + }, + { + "epoch": 0.6195697764994906, + "grad_norm": 0.3304736614227295, + "learning_rate": 1.577377416422904e-05, + "loss": 0.0371, + "step": 10340 + }, + { + "epoch": 0.6201689735754089, + "grad_norm": 0.6479781866073608, + "learning_rate": 1.576550613925437e-05, + "loss": 0.0357, + "step": 10350 + }, + { + "epoch": 0.6207681706513272, + "grad_norm": 0.5461524128913879, + "learning_rate": 1.575723252169281e-05, + "loss": 0.0367, + "step": 10360 + }, + { + "epoch": 0.6213673677272455, + "grad_norm": 0.4362160563468933, + "learning_rate": 1.574895332125391e-05, + "loss": 0.0302, + "step": 10370 + }, + { + "epoch": 0.6219665648031638, + "grad_norm": 0.5188114643096924, + "learning_rate": 1.574066854765377e-05, + "loss": 0.0322, + "step": 10380 + }, + { + "epoch": 0.6225657618790821, + "grad_norm": 0.34805068373680115, + "learning_rate": 1.5732378210615032e-05, + "loss": 0.0355, + "step": 10390 + }, + { + "epoch": 0.6231649589550003, + "grad_norm": 0.5073755383491516, + "learning_rate": 1.5724082319866873e-05, + "loss": 0.0446, + "step": 10400 + }, + { + "epoch": 0.6237641560309186, + "grad_norm": 0.5647034645080566, + "learning_rate": 1.5715780885144983e-05, + "loss": 0.0386, + "step": 10410 + }, + { + "epoch": 0.6243633531068369, + "grad_norm": 0.5983169078826904, + "learning_rate": 1.570747391619155e-05, + "loss": 0.0387, + "step": 10420 + }, + { + "epoch": 0.6249625501827551, + "grad_norm": 0.4163302481174469, + "learning_rate": 1.5699161422755276e-05, + "loss": 0.0278, + "step": 10430 + }, + { + "epoch": 0.6255617472586734, + "grad_norm": 0.5769792199134827, + "learning_rate": 1.5690843414591325e-05, + "loss": 0.0335, + "step": 10440 + }, + { + "epoch": 0.6261609443345917, + "grad_norm": 0.33103784918785095, + "learning_rate": 1.5682519901461353e-05, + "loss": 0.0272, + "step": 10450 + }, + { + "epoch": 0.62676014141051, + "grad_norm": 0.6019038558006287, + "learning_rate": 1.567419089313346e-05, + "loss": 0.0286, + "step": 10460 + }, + { + "epoch": 0.6273593384864282, + "grad_norm": 0.8199634552001953, + "learning_rate": 1.56658563993822e-05, + "loss": 0.041, + "step": 10470 + }, + { + "epoch": 0.6279585355623465, + "grad_norm": 0.7426667213439941, + "learning_rate": 1.565751642998857e-05, + "loss": 0.0327, + "step": 10480 + }, + { + "epoch": 0.6285577326382648, + "grad_norm": 0.3630203306674957, + "learning_rate": 1.5649170994739983e-05, + "loss": 0.0316, + "step": 10490 + }, + { + "epoch": 0.629156929714183, + "grad_norm": 0.7804543972015381, + "learning_rate": 1.5640820103430277e-05, + "loss": 0.0369, + "step": 10500 + }, + { + "epoch": 0.6297561267901013, + "grad_norm": 0.43314239382743835, + "learning_rate": 1.5632463765859685e-05, + "loss": 0.0362, + "step": 10510 + }, + { + "epoch": 0.6303553238660196, + "grad_norm": 0.5570499897003174, + "learning_rate": 1.562410199183484e-05, + "loss": 0.0307, + "step": 10520 + }, + { + "epoch": 0.6309545209419378, + "grad_norm": 0.5796618461608887, + "learning_rate": 1.5615734791168742e-05, + "loss": 0.0312, + "step": 10530 + }, + { + "epoch": 0.6315537180178561, + "grad_norm": 0.7355082035064697, + "learning_rate": 1.5607362173680774e-05, + "loss": 0.0357, + "step": 10540 + }, + { + "epoch": 0.6321529150937744, + "grad_norm": 0.39807555079460144, + "learning_rate": 1.559898414919666e-05, + "loss": 0.0281, + "step": 10550 + }, + { + "epoch": 0.6327521121696926, + "grad_norm": 0.7723329663276672, + "learning_rate": 1.5590600727548487e-05, + "loss": 0.0314, + "step": 10560 + }, + { + "epoch": 0.6333513092456109, + "grad_norm": 0.3936077058315277, + "learning_rate": 1.558221191857467e-05, + "loss": 0.0344, + "step": 10570 + }, + { + "epoch": 0.6339505063215292, + "grad_norm": 0.6881195902824402, + "learning_rate": 1.557381773211993e-05, + "loss": 0.0343, + "step": 10580 + }, + { + "epoch": 0.6345497033974474, + "grad_norm": 0.5343065857887268, + "learning_rate": 1.556541817803533e-05, + "loss": 0.0336, + "step": 10590 + }, + { + "epoch": 0.6351489004733657, + "grad_norm": 0.6643530130386353, + "learning_rate": 1.5557013266178192e-05, + "loss": 0.032, + "step": 10600 + }, + { + "epoch": 0.635748097549284, + "grad_norm": 0.5642407536506653, + "learning_rate": 1.5548603006412172e-05, + "loss": 0.0326, + "step": 10610 + }, + { + "epoch": 0.6363472946252022, + "grad_norm": 0.6929567456245422, + "learning_rate": 1.554018740860716e-05, + "loss": 0.0351, + "step": 10620 + }, + { + "epoch": 0.6369464917011205, + "grad_norm": 0.33013442158699036, + "learning_rate": 1.5531766482639342e-05, + "loss": 0.0362, + "step": 10630 + }, + { + "epoch": 0.6375456887770388, + "grad_norm": 1.056101679801941, + "learning_rate": 1.5523340238391135e-05, + "loss": 0.0443, + "step": 10640 + }, + { + "epoch": 0.638144885852957, + "grad_norm": 0.5164589881896973, + "learning_rate": 1.5514908685751208e-05, + "loss": 0.0446, + "step": 10650 + }, + { + "epoch": 0.6387440829288753, + "grad_norm": 0.319035142660141, + "learning_rate": 1.550647183461446e-05, + "loss": 0.0367, + "step": 10660 + }, + { + "epoch": 0.6393432800047936, + "grad_norm": 0.8530817627906799, + "learning_rate": 1.5498029694882004e-05, + "loss": 0.0321, + "step": 10670 + }, + { + "epoch": 0.6399424770807118, + "grad_norm": 0.7768056392669678, + "learning_rate": 1.548958227646116e-05, + "loss": 0.0318, + "step": 10680 + }, + { + "epoch": 0.6405416741566301, + "grad_norm": 0.4015219211578369, + "learning_rate": 1.5481129589265445e-05, + "loss": 0.0263, + "step": 10690 + }, + { + "epoch": 0.6411408712325484, + "grad_norm": 0.6409371495246887, + "learning_rate": 1.547267164321456e-05, + "loss": 0.0371, + "step": 10700 + }, + { + "epoch": 0.6417400683084666, + "grad_norm": 0.5829829573631287, + "learning_rate": 1.5464208448234378e-05, + "loss": 0.0424, + "step": 10710 + }, + { + "epoch": 0.6423392653843849, + "grad_norm": 0.8098331093788147, + "learning_rate": 1.545574001425692e-05, + "loss": 0.0318, + "step": 10720 + }, + { + "epoch": 0.6429384624603032, + "grad_norm": 0.40581029653549194, + "learning_rate": 1.5447266351220372e-05, + "loss": 0.0345, + "step": 10730 + }, + { + "epoch": 0.6435376595362214, + "grad_norm": 0.5018268823623657, + "learning_rate": 1.543878746906905e-05, + "loss": 0.0338, + "step": 10740 + }, + { + "epoch": 0.6441368566121397, + "grad_norm": 0.3689005970954895, + "learning_rate": 1.5430303377753396e-05, + "loss": 0.0304, + "step": 10750 + }, + { + "epoch": 0.644736053688058, + "grad_norm": 0.4961407482624054, + "learning_rate": 1.542181408722996e-05, + "loss": 0.0349, + "step": 10760 + }, + { + "epoch": 0.6453352507639762, + "grad_norm": 0.5551972389221191, + "learning_rate": 1.5413319607461397e-05, + "loss": 0.0389, + "step": 10770 + }, + { + "epoch": 0.6459344478398945, + "grad_norm": 0.5989762544631958, + "learning_rate": 1.5404819948416452e-05, + "loss": 0.0308, + "step": 10780 + }, + { + "epoch": 0.6465336449158128, + "grad_norm": 0.33431145548820496, + "learning_rate": 1.539631512006995e-05, + "loss": 0.0291, + "step": 10790 + }, + { + "epoch": 0.647132841991731, + "grad_norm": 0.5390793085098267, + "learning_rate": 1.5387805132402785e-05, + "loss": 0.0409, + "step": 10800 + }, + { + "epoch": 0.6477320390676493, + "grad_norm": 0.6348057389259338, + "learning_rate": 1.537928999540189e-05, + "loss": 0.0299, + "step": 10810 + }, + { + "epoch": 0.6483312361435676, + "grad_norm": 0.9015149474143982, + "learning_rate": 1.5370769719060262e-05, + "loss": 0.0372, + "step": 10820 + }, + { + "epoch": 0.6489304332194858, + "grad_norm": 0.4148661494255066, + "learning_rate": 1.5362244313376922e-05, + "loss": 0.0351, + "step": 10830 + }, + { + "epoch": 0.6495296302954041, + "grad_norm": 0.48212167620658875, + "learning_rate": 1.53537137883569e-05, + "loss": 0.0369, + "step": 10840 + }, + { + "epoch": 0.6501288273713224, + "grad_norm": 0.6210904121398926, + "learning_rate": 1.5345178154011247e-05, + "loss": 0.0387, + "step": 10850 + }, + { + "epoch": 0.6507280244472406, + "grad_norm": 0.4606397747993469, + "learning_rate": 1.5336637420357003e-05, + "loss": 0.0325, + "step": 10860 + }, + { + "epoch": 0.6513272215231589, + "grad_norm": 0.597671627998352, + "learning_rate": 1.5328091597417195e-05, + "loss": 0.0264, + "step": 10870 + }, + { + "epoch": 0.6519264185990772, + "grad_norm": 0.39612457156181335, + "learning_rate": 1.5319540695220822e-05, + "loss": 0.0291, + "step": 10880 + }, + { + "epoch": 0.6525256156749955, + "grad_norm": 0.514916718006134, + "learning_rate": 1.531098472380285e-05, + "loss": 0.0327, + "step": 10890 + }, + { + "epoch": 0.6531248127509138, + "grad_norm": 0.3551333248615265, + "learning_rate": 1.5302423693204185e-05, + "loss": 0.0306, + "step": 10900 + }, + { + "epoch": 0.6537240098268321, + "grad_norm": 0.3721555173397064, + "learning_rate": 1.5293857613471664e-05, + "loss": 0.0343, + "step": 10910 + }, + { + "epoch": 0.6543232069027504, + "grad_norm": 0.3669307231903076, + "learning_rate": 1.5285286494658072e-05, + "loss": 0.0339, + "step": 10920 + }, + { + "epoch": 0.6549224039786686, + "grad_norm": 0.5142899751663208, + "learning_rate": 1.5276710346822085e-05, + "loss": 0.0388, + "step": 10930 + }, + { + "epoch": 0.6555216010545869, + "grad_norm": 0.7722563147544861, + "learning_rate": 1.52681291800283e-05, + "loss": 0.0319, + "step": 10940 + }, + { + "epoch": 0.6561207981305052, + "grad_norm": 0.5405625104904175, + "learning_rate": 1.5259543004347183e-05, + "loss": 0.025, + "step": 10950 + }, + { + "epoch": 0.6567199952064234, + "grad_norm": 0.6617732048034668, + "learning_rate": 1.5250951829855097e-05, + "loss": 0.0361, + "step": 10960 + }, + { + "epoch": 0.6573191922823417, + "grad_norm": 0.8938334584236145, + "learning_rate": 1.5242355666634257e-05, + "loss": 0.0326, + "step": 10970 + }, + { + "epoch": 0.65791838935826, + "grad_norm": 0.7913880944252014, + "learning_rate": 1.5233754524772746e-05, + "loss": 0.0325, + "step": 10980 + }, + { + "epoch": 0.6585175864341782, + "grad_norm": 0.6919751763343811, + "learning_rate": 1.5225148414364481e-05, + "loss": 0.0353, + "step": 10990 + }, + { + "epoch": 0.6591167835100965, + "grad_norm": 0.6518043279647827, + "learning_rate": 1.5216537345509212e-05, + "loss": 0.0292, + "step": 11000 + }, + { + "epoch": 0.6597159805860148, + "grad_norm": 0.8302627801895142, + "learning_rate": 1.5207921328312508e-05, + "loss": 0.0292, + "step": 11010 + }, + { + "epoch": 0.660315177661933, + "grad_norm": 0.6278629302978516, + "learning_rate": 1.5199300372885739e-05, + "loss": 0.0314, + "step": 11020 + }, + { + "epoch": 0.6609143747378513, + "grad_norm": 0.42736759781837463, + "learning_rate": 1.519067448934609e-05, + "loss": 0.0313, + "step": 11030 + }, + { + "epoch": 0.6615135718137696, + "grad_norm": 1.0469647645950317, + "learning_rate": 1.5182043687816504e-05, + "loss": 0.038, + "step": 11040 + }, + { + "epoch": 0.6621127688896878, + "grad_norm": 0.4306422173976898, + "learning_rate": 1.517340797842571e-05, + "loss": 0.0362, + "step": 11050 + }, + { + "epoch": 0.6627119659656061, + "grad_norm": 0.692587673664093, + "learning_rate": 1.5164767371308196e-05, + "loss": 0.034, + "step": 11060 + }, + { + "epoch": 0.6633111630415244, + "grad_norm": 0.8272542953491211, + "learning_rate": 1.5156121876604195e-05, + "loss": 0.0332, + "step": 11070 + }, + { + "epoch": 0.6639103601174426, + "grad_norm": 0.700703501701355, + "learning_rate": 1.5147471504459675e-05, + "loss": 0.0435, + "step": 11080 + }, + { + "epoch": 0.6645095571933609, + "grad_norm": 0.22474133968353271, + "learning_rate": 1.5138816265026333e-05, + "loss": 0.0348, + "step": 11090 + }, + { + "epoch": 0.6651087542692792, + "grad_norm": 0.47771376371383667, + "learning_rate": 1.5130156168461571e-05, + "loss": 0.0365, + "step": 11100 + }, + { + "epoch": 0.6657079513451974, + "grad_norm": 0.5043072700500488, + "learning_rate": 1.5121491224928496e-05, + "loss": 0.0336, + "step": 11110 + }, + { + "epoch": 0.6663071484211157, + "grad_norm": 0.4886966347694397, + "learning_rate": 1.5112821444595904e-05, + "loss": 0.0291, + "step": 11120 + }, + { + "epoch": 0.666906345497034, + "grad_norm": 0.3845444321632385, + "learning_rate": 1.5104146837638263e-05, + "loss": 0.0418, + "step": 11130 + }, + { + "epoch": 0.6675055425729522, + "grad_norm": 0.6324570775032043, + "learning_rate": 1.5095467414235708e-05, + "loss": 0.0357, + "step": 11140 + }, + { + "epoch": 0.6681047396488705, + "grad_norm": 0.5614244937896729, + "learning_rate": 1.5086783184574023e-05, + "loss": 0.0351, + "step": 11150 + }, + { + "epoch": 0.6687039367247888, + "grad_norm": 0.4815816879272461, + "learning_rate": 1.5078094158844638e-05, + "loss": 0.0401, + "step": 11160 + }, + { + "epoch": 0.669303133800707, + "grad_norm": 0.7729785442352295, + "learning_rate": 1.5069400347244608e-05, + "loss": 0.0357, + "step": 11170 + }, + { + "epoch": 0.6699023308766253, + "grad_norm": 0.589121401309967, + "learning_rate": 1.5060701759976608e-05, + "loss": 0.0319, + "step": 11180 + }, + { + "epoch": 0.6705015279525436, + "grad_norm": 0.5420895218849182, + "learning_rate": 1.5051998407248908e-05, + "loss": 0.0346, + "step": 11190 + }, + { + "epoch": 0.6711007250284619, + "grad_norm": 0.4504237771034241, + "learning_rate": 1.5043290299275383e-05, + "loss": 0.0279, + "step": 11200 + }, + { + "epoch": 0.6716999221043801, + "grad_norm": 0.26984909176826477, + "learning_rate": 1.5034577446275485e-05, + "loss": 0.034, + "step": 11210 + }, + { + "epoch": 0.6722991191802984, + "grad_norm": 0.6075000762939453, + "learning_rate": 1.5025859858474224e-05, + "loss": 0.0319, + "step": 11220 + }, + { + "epoch": 0.6728983162562167, + "grad_norm": 0.6065084338188171, + "learning_rate": 1.5017137546102182e-05, + "loss": 0.0383, + "step": 11230 + }, + { + "epoch": 0.6734975133321349, + "grad_norm": 0.573225736618042, + "learning_rate": 1.5008410519395473e-05, + "loss": 0.0424, + "step": 11240 + }, + { + "epoch": 0.6740967104080532, + "grad_norm": 0.8821173906326294, + "learning_rate": 1.4999678788595752e-05, + "loss": 0.0409, + "step": 11250 + }, + { + "epoch": 0.6746959074839715, + "grad_norm": 0.4947790205478668, + "learning_rate": 1.4990942363950192e-05, + "loss": 0.0472, + "step": 11260 + }, + { + "epoch": 0.6752951045598897, + "grad_norm": 0.748337984085083, + "learning_rate": 1.4982201255711472e-05, + "loss": 0.0384, + "step": 11270 + }, + { + "epoch": 0.675894301635808, + "grad_norm": 0.6375566124916077, + "learning_rate": 1.4973455474137773e-05, + "loss": 0.0373, + "step": 11280 + }, + { + "epoch": 0.6764934987117263, + "grad_norm": 0.6218035221099854, + "learning_rate": 1.496470502949275e-05, + "loss": 0.0343, + "step": 11290 + }, + { + "epoch": 0.6770926957876445, + "grad_norm": 0.4296681880950928, + "learning_rate": 1.4955949932045544e-05, + "loss": 0.0317, + "step": 11300 + }, + { + "epoch": 0.6776918928635628, + "grad_norm": 0.3609360158443451, + "learning_rate": 1.4947190192070746e-05, + "loss": 0.0348, + "step": 11310 + }, + { + "epoch": 0.6782910899394811, + "grad_norm": 0.49597665667533875, + "learning_rate": 1.4938425819848403e-05, + "loss": 0.034, + "step": 11320 + }, + { + "epoch": 0.6788902870153993, + "grad_norm": 0.4339931309223175, + "learning_rate": 1.4929656825663985e-05, + "loss": 0.0351, + "step": 11330 + }, + { + "epoch": 0.6794894840913176, + "grad_norm": 0.44051092863082886, + "learning_rate": 1.4920883219808404e-05, + "loss": 0.0391, + "step": 11340 + }, + { + "epoch": 0.6800886811672359, + "grad_norm": 0.41610655188560486, + "learning_rate": 1.4912105012577971e-05, + "loss": 0.0345, + "step": 11350 + }, + { + "epoch": 0.6806878782431541, + "grad_norm": 0.6215106844902039, + "learning_rate": 1.4903322214274403e-05, + "loss": 0.0439, + "step": 11360 + }, + { + "epoch": 0.6812870753190724, + "grad_norm": 0.6418285965919495, + "learning_rate": 1.4894534835204802e-05, + "loss": 0.0289, + "step": 11370 + }, + { + "epoch": 0.6818862723949907, + "grad_norm": 0.6148926019668579, + "learning_rate": 1.4885742885681645e-05, + "loss": 0.0396, + "step": 11380 + }, + { + "epoch": 0.6824854694709089, + "grad_norm": 0.8690620064735413, + "learning_rate": 1.4876946376022778e-05, + "loss": 0.0371, + "step": 11390 + }, + { + "epoch": 0.6830846665468272, + "grad_norm": 0.4794996678829193, + "learning_rate": 1.486814531655139e-05, + "loss": 0.0317, + "step": 11400 + }, + { + "epoch": 0.6836838636227455, + "grad_norm": 0.7622746229171753, + "learning_rate": 1.4859339717596023e-05, + "loss": 0.0396, + "step": 11410 + }, + { + "epoch": 0.6842830606986637, + "grad_norm": 1.0384955406188965, + "learning_rate": 1.4850529589490527e-05, + "loss": 0.0352, + "step": 11420 + }, + { + "epoch": 0.6848822577745821, + "grad_norm": 0.33424243330955505, + "learning_rate": 1.484171494257409e-05, + "loss": 0.0272, + "step": 11430 + }, + { + "epoch": 0.6854814548505004, + "grad_norm": 0.5626234412193298, + "learning_rate": 1.4832895787191182e-05, + "loss": 0.0267, + "step": 11440 + }, + { + "epoch": 0.6860806519264186, + "grad_norm": 0.31714314222335815, + "learning_rate": 1.4824072133691573e-05, + "loss": 0.0297, + "step": 11450 + }, + { + "epoch": 0.6866798490023369, + "grad_norm": 0.8281066417694092, + "learning_rate": 1.481524399243032e-05, + "loss": 0.0337, + "step": 11460 + }, + { + "epoch": 0.6872790460782552, + "grad_norm": 0.6054716110229492, + "learning_rate": 1.4806411373767728e-05, + "loss": 0.0336, + "step": 11470 + }, + { + "epoch": 0.6878782431541735, + "grad_norm": 0.5764144659042358, + "learning_rate": 1.4797574288069379e-05, + "loss": 0.0296, + "step": 11480 + }, + { + "epoch": 0.6884774402300917, + "grad_norm": 0.4696876108646393, + "learning_rate": 1.478873274570607e-05, + "loss": 0.0318, + "step": 11490 + }, + { + "epoch": 0.68907663730601, + "grad_norm": 0.5324695110321045, + "learning_rate": 1.4779886757053861e-05, + "loss": 0.0294, + "step": 11500 + }, + { + "epoch": 0.6896758343819283, + "grad_norm": 0.2989593744277954, + "learning_rate": 1.4771036332494e-05, + "loss": 0.0275, + "step": 11510 + }, + { + "epoch": 0.6902750314578465, + "grad_norm": 0.6373855471611023, + "learning_rate": 1.4762181482412957e-05, + "loss": 0.0334, + "step": 11520 + }, + { + "epoch": 0.6908742285337648, + "grad_norm": 0.5332064032554626, + "learning_rate": 1.4753322217202389e-05, + "loss": 0.0333, + "step": 11530 + }, + { + "epoch": 0.6914734256096831, + "grad_norm": 0.4900652766227722, + "learning_rate": 1.474445854725914e-05, + "loss": 0.0294, + "step": 11540 + }, + { + "epoch": 0.6920726226856013, + "grad_norm": 0.6812027096748352, + "learning_rate": 1.4735590482985218e-05, + "loss": 0.0321, + "step": 11550 + }, + { + "epoch": 0.6926718197615196, + "grad_norm": 0.6765509843826294, + "learning_rate": 1.4726718034787799e-05, + "loss": 0.0329, + "step": 11560 + }, + { + "epoch": 0.6932710168374379, + "grad_norm": 0.5016193389892578, + "learning_rate": 1.4717841213079182e-05, + "loss": 0.034, + "step": 11570 + }, + { + "epoch": 0.6938702139133561, + "grad_norm": 0.5259473919868469, + "learning_rate": 1.4708960028276823e-05, + "loss": 0.0341, + "step": 11580 + }, + { + "epoch": 0.6944694109892744, + "grad_norm": 0.4551076292991638, + "learning_rate": 1.4700074490803277e-05, + "loss": 0.0289, + "step": 11590 + }, + { + "epoch": 0.6950686080651927, + "grad_norm": 0.5946309566497803, + "learning_rate": 1.4691184611086226e-05, + "loss": 0.0367, + "step": 11600 + }, + { + "epoch": 0.6956678051411109, + "grad_norm": 0.8045580387115479, + "learning_rate": 1.4682290399558436e-05, + "loss": 0.0292, + "step": 11610 + }, + { + "epoch": 0.6962670022170292, + "grad_norm": 1.089473843574524, + "learning_rate": 1.4673391866657755e-05, + "loss": 0.0433, + "step": 11620 + }, + { + "epoch": 0.6968661992929475, + "grad_norm": 0.7314861416816711, + "learning_rate": 1.4664489022827118e-05, + "loss": 0.0344, + "step": 11630 + }, + { + "epoch": 0.6974653963688657, + "grad_norm": 0.3244793713092804, + "learning_rate": 1.4655581878514493e-05, + "loss": 0.0329, + "step": 11640 + }, + { + "epoch": 0.698064593444784, + "grad_norm": 0.9454575181007385, + "learning_rate": 1.4646670444172925e-05, + "loss": 0.041, + "step": 11650 + }, + { + "epoch": 0.6986637905207023, + "grad_norm": 0.4321480393409729, + "learning_rate": 1.4637754730260467e-05, + "loss": 0.0338, + "step": 11660 + }, + { + "epoch": 0.6992629875966205, + "grad_norm": 0.7338399887084961, + "learning_rate": 1.4628834747240216e-05, + "loss": 0.0317, + "step": 11670 + }, + { + "epoch": 0.6998621846725388, + "grad_norm": 0.5811594724655151, + "learning_rate": 1.4619910505580257e-05, + "loss": 0.0299, + "step": 11680 + }, + { + "epoch": 0.7004613817484571, + "grad_norm": 1.1259782314300537, + "learning_rate": 1.4610982015753693e-05, + "loss": 0.0402, + "step": 11690 + }, + { + "epoch": 0.7010605788243753, + "grad_norm": 0.4460951089859009, + "learning_rate": 1.4602049288238604e-05, + "loss": 0.0279, + "step": 11700 + }, + { + "epoch": 0.7016597759002936, + "grad_norm": 0.4996945858001709, + "learning_rate": 1.4593112333518041e-05, + "loss": 0.0331, + "step": 11710 + }, + { + "epoch": 0.7022589729762119, + "grad_norm": 0.6428117156028748, + "learning_rate": 1.4584171162080018e-05, + "loss": 0.0339, + "step": 11720 + }, + { + "epoch": 0.7028581700521301, + "grad_norm": 0.7815113663673401, + "learning_rate": 1.45752257844175e-05, + "loss": 0.0333, + "step": 11730 + }, + { + "epoch": 0.7034573671280484, + "grad_norm": 0.46364331245422363, + "learning_rate": 1.4566276211028385e-05, + "loss": 0.0321, + "step": 11740 + }, + { + "epoch": 0.7040565642039667, + "grad_norm": 0.6084109544754028, + "learning_rate": 1.4557322452415492e-05, + "loss": 0.0347, + "step": 11750 + }, + { + "epoch": 0.7046557612798849, + "grad_norm": 0.5775942206382751, + "learning_rate": 1.454836451908656e-05, + "loss": 0.0339, + "step": 11760 + }, + { + "epoch": 0.7052549583558032, + "grad_norm": 0.4764224886894226, + "learning_rate": 1.4539402421554222e-05, + "loss": 0.0326, + "step": 11770 + }, + { + "epoch": 0.7058541554317215, + "grad_norm": 0.49608105421066284, + "learning_rate": 1.4530436170335997e-05, + "loss": 0.033, + "step": 11780 + }, + { + "epoch": 0.7064533525076397, + "grad_norm": 0.40599140524864197, + "learning_rate": 1.4521465775954285e-05, + "loss": 0.0323, + "step": 11790 + }, + { + "epoch": 0.707052549583558, + "grad_norm": 0.44920462369918823, + "learning_rate": 1.4512491248936336e-05, + "loss": 0.0348, + "step": 11800 + }, + { + "epoch": 0.7076517466594763, + "grad_norm": 0.393081396818161, + "learning_rate": 1.4503512599814263e-05, + "loss": 0.0329, + "step": 11810 + }, + { + "epoch": 0.7082509437353945, + "grad_norm": 0.5393109917640686, + "learning_rate": 1.4494529839125005e-05, + "loss": 0.0332, + "step": 11820 + }, + { + "epoch": 0.7088501408113128, + "grad_norm": 0.49641427397727966, + "learning_rate": 1.4485542977410341e-05, + "loss": 0.0341, + "step": 11830 + }, + { + "epoch": 0.7094493378872311, + "grad_norm": 0.4762181341648102, + "learning_rate": 1.4476552025216845e-05, + "loss": 0.0293, + "step": 11840 + }, + { + "epoch": 0.7100485349631493, + "grad_norm": 0.7498350143432617, + "learning_rate": 1.4467556993095908e-05, + "loss": 0.0338, + "step": 11850 + }, + { + "epoch": 0.7106477320390676, + "grad_norm": 0.5212231874465942, + "learning_rate": 1.4458557891603692e-05, + "loss": 0.0336, + "step": 11860 + }, + { + "epoch": 0.7112469291149859, + "grad_norm": 0.3803718388080597, + "learning_rate": 1.4449554731301152e-05, + "loss": 0.0336, + "step": 11870 + }, + { + "epoch": 0.7118461261909041, + "grad_norm": 0.3723069429397583, + "learning_rate": 1.4440547522753993e-05, + "loss": 0.0313, + "step": 11880 + }, + { + "epoch": 0.7124453232668224, + "grad_norm": 0.6411343216896057, + "learning_rate": 1.443153627653268e-05, + "loss": 0.0298, + "step": 11890 + }, + { + "epoch": 0.7130445203427407, + "grad_norm": 0.7487270832061768, + "learning_rate": 1.4422521003212413e-05, + "loss": 0.0334, + "step": 11900 + }, + { + "epoch": 0.713643717418659, + "grad_norm": 0.4146348237991333, + "learning_rate": 1.4413501713373111e-05, + "loss": 0.0362, + "step": 11910 + }, + { + "epoch": 0.7142429144945772, + "grad_norm": 0.6354920864105225, + "learning_rate": 1.4404478417599417e-05, + "loss": 0.0345, + "step": 11920 + }, + { + "epoch": 0.7148421115704955, + "grad_norm": 0.8422425985336304, + "learning_rate": 1.4395451126480673e-05, + "loss": 0.0379, + "step": 11930 + }, + { + "epoch": 0.7154413086464138, + "grad_norm": 0.6452838182449341, + "learning_rate": 1.4386419850610905e-05, + "loss": 0.0317, + "step": 11940 + }, + { + "epoch": 0.716040505722332, + "grad_norm": 0.6057304739952087, + "learning_rate": 1.437738460058882e-05, + "loss": 0.0349, + "step": 11950 + }, + { + "epoch": 0.7166397027982504, + "grad_norm": 0.4880058467388153, + "learning_rate": 1.4368345387017784e-05, + "loss": 0.0283, + "step": 11960 + }, + { + "epoch": 0.7172388998741687, + "grad_norm": 0.6094764471054077, + "learning_rate": 1.435930222050582e-05, + "loss": 0.0424, + "step": 11970 + }, + { + "epoch": 0.7178380969500869, + "grad_norm": 0.552979588508606, + "learning_rate": 1.4350255111665589e-05, + "loss": 0.0318, + "step": 11980 + }, + { + "epoch": 0.7184372940260052, + "grad_norm": 0.5134180188179016, + "learning_rate": 1.4341204071114374e-05, + "loss": 0.0267, + "step": 11990 + }, + { + "epoch": 0.7190364911019235, + "grad_norm": 0.3264164626598358, + "learning_rate": 1.4332149109474079e-05, + "loss": 0.0347, + "step": 12000 + }, + { + "epoch": 0.7196356881778417, + "grad_norm": 0.6406404972076416, + "learning_rate": 1.4323090237371205e-05, + "loss": 0.0326, + "step": 12010 + }, + { + "epoch": 0.72023488525376, + "grad_norm": 0.4818336069583893, + "learning_rate": 1.4314027465436835e-05, + "loss": 0.0357, + "step": 12020 + }, + { + "epoch": 0.7208340823296783, + "grad_norm": 0.4660695791244507, + "learning_rate": 1.430496080430665e-05, + "loss": 0.0348, + "step": 12030 + }, + { + "epoch": 0.7214332794055965, + "grad_norm": 0.527518093585968, + "learning_rate": 1.4295890264620873e-05, + "loss": 0.0293, + "step": 12040 + }, + { + "epoch": 0.7220324764815148, + "grad_norm": 0.5105645656585693, + "learning_rate": 1.4286815857024292e-05, + "loss": 0.0299, + "step": 12050 + }, + { + "epoch": 0.7226316735574331, + "grad_norm": 0.5807327628135681, + "learning_rate": 1.4277737592166226e-05, + "loss": 0.0348, + "step": 12060 + }, + { + "epoch": 0.7232308706333513, + "grad_norm": 0.34552720189094543, + "learning_rate": 1.4268655480700526e-05, + "loss": 0.0281, + "step": 12070 + }, + { + "epoch": 0.7238300677092696, + "grad_norm": 0.6902264952659607, + "learning_rate": 1.4259569533285555e-05, + "loss": 0.0345, + "step": 12080 + }, + { + "epoch": 0.7244292647851879, + "grad_norm": 0.7842390537261963, + "learning_rate": 1.425047976058418e-05, + "loss": 0.0392, + "step": 12090 + }, + { + "epoch": 0.7250284618611061, + "grad_norm": 0.37371599674224854, + "learning_rate": 1.4241386173263754e-05, + "loss": 0.0307, + "step": 12100 + }, + { + "epoch": 0.7256276589370244, + "grad_norm": 0.4447094798088074, + "learning_rate": 1.4232288781996109e-05, + "loss": 0.0343, + "step": 12110 + }, + { + "epoch": 0.7262268560129427, + "grad_norm": 0.5179654359817505, + "learning_rate": 1.4223187597457541e-05, + "loss": 0.0328, + "step": 12120 + }, + { + "epoch": 0.726826053088861, + "grad_norm": 0.34313148260116577, + "learning_rate": 1.4214082630328794e-05, + "loss": 0.0327, + "step": 12130 + }, + { + "epoch": 0.7274252501647792, + "grad_norm": 0.5038807988166809, + "learning_rate": 1.420497389129506e-05, + "loss": 0.0398, + "step": 12140 + }, + { + "epoch": 0.7280244472406975, + "grad_norm": 0.5751231908798218, + "learning_rate": 1.4195861391045944e-05, + "loss": 0.0365, + "step": 12150 + }, + { + "epoch": 0.7286236443166157, + "grad_norm": 0.23205915093421936, + "learning_rate": 1.418674514027548e-05, + "loss": 0.0338, + "step": 12160 + }, + { + "epoch": 0.729222841392534, + "grad_norm": 0.3348182141780853, + "learning_rate": 1.4177625149682092e-05, + "loss": 0.0264, + "step": 12170 + }, + { + "epoch": 0.7298220384684523, + "grad_norm": 0.432725727558136, + "learning_rate": 1.4168501429968596e-05, + "loss": 0.0377, + "step": 12180 + }, + { + "epoch": 0.7304212355443706, + "grad_norm": 0.5504162907600403, + "learning_rate": 1.415937399184219e-05, + "loss": 0.0334, + "step": 12190 + }, + { + "epoch": 0.7310204326202888, + "grad_norm": 0.7994229793548584, + "learning_rate": 1.4150242846014421e-05, + "loss": 0.0369, + "step": 12200 + }, + { + "epoch": 0.7316196296962071, + "grad_norm": 0.7374292016029358, + "learning_rate": 1.414110800320121e-05, + "loss": 0.0305, + "step": 12210 + }, + { + "epoch": 0.7322188267721254, + "grad_norm": 0.786674976348877, + "learning_rate": 1.4131969474122797e-05, + "loss": 0.0283, + "step": 12220 + }, + { + "epoch": 0.7328180238480436, + "grad_norm": 0.39285191893577576, + "learning_rate": 1.4122827269503756e-05, + "loss": 0.028, + "step": 12230 + }, + { + "epoch": 0.7334172209239619, + "grad_norm": 0.49710261821746826, + "learning_rate": 1.4113681400072971e-05, + "loss": 0.0285, + "step": 12240 + }, + { + "epoch": 0.7340164179998802, + "grad_norm": 0.2925172448158264, + "learning_rate": 1.4104531876563635e-05, + "loss": 0.0353, + "step": 12250 + }, + { + "epoch": 0.7346156150757984, + "grad_norm": 0.5930903553962708, + "learning_rate": 1.4095378709713218e-05, + "loss": 0.0265, + "step": 12260 + }, + { + "epoch": 0.7352148121517167, + "grad_norm": 0.5205737352371216, + "learning_rate": 1.408622191026347e-05, + "loss": 0.0349, + "step": 12270 + }, + { + "epoch": 0.735814009227635, + "grad_norm": 0.5042659044265747, + "learning_rate": 1.4077061488960414e-05, + "loss": 0.0376, + "step": 12280 + }, + { + "epoch": 0.7364132063035532, + "grad_norm": 0.6537132263183594, + "learning_rate": 1.4067897456554302e-05, + "loss": 0.0402, + "step": 12290 + }, + { + "epoch": 0.7370124033794715, + "grad_norm": 0.5453435182571411, + "learning_rate": 1.4058729823799649e-05, + "loss": 0.0344, + "step": 12300 + }, + { + "epoch": 0.7376116004553898, + "grad_norm": 0.7153663635253906, + "learning_rate": 1.4049558601455175e-05, + "loss": 0.0365, + "step": 12310 + }, + { + "epoch": 0.738210797531308, + "grad_norm": 0.4821360409259796, + "learning_rate": 1.4040383800283824e-05, + "loss": 0.0359, + "step": 12320 + }, + { + "epoch": 0.7388099946072263, + "grad_norm": 0.389950156211853, + "learning_rate": 1.403120543105273e-05, + "loss": 0.031, + "step": 12330 + }, + { + "epoch": 0.7394091916831446, + "grad_norm": 0.6750137805938721, + "learning_rate": 1.4022023504533227e-05, + "loss": 0.0353, + "step": 12340 + }, + { + "epoch": 0.7400083887590628, + "grad_norm": 0.5380377173423767, + "learning_rate": 1.4012838031500814e-05, + "loss": 0.0329, + "step": 12350 + }, + { + "epoch": 0.7406075858349811, + "grad_norm": 0.45814576745033264, + "learning_rate": 1.4003649022735159e-05, + "loss": 0.0312, + "step": 12360 + }, + { + "epoch": 0.7412067829108994, + "grad_norm": 0.6910536289215088, + "learning_rate": 1.3994456489020072e-05, + "loss": 0.0349, + "step": 12370 + }, + { + "epoch": 0.7418059799868176, + "grad_norm": 0.49182868003845215, + "learning_rate": 1.3985260441143504e-05, + "loss": 0.0377, + "step": 12380 + }, + { + "epoch": 0.7424051770627359, + "grad_norm": 0.41329771280288696, + "learning_rate": 1.397606088989753e-05, + "loss": 0.0383, + "step": 12390 + }, + { + "epoch": 0.7430043741386542, + "grad_norm": 0.47242429852485657, + "learning_rate": 1.3966857846078337e-05, + "loss": 0.0313, + "step": 12400 + }, + { + "epoch": 0.7436035712145724, + "grad_norm": 0.45115360617637634, + "learning_rate": 1.3957651320486206e-05, + "loss": 0.0294, + "step": 12410 + }, + { + "epoch": 0.7442027682904907, + "grad_norm": 0.44364428520202637, + "learning_rate": 1.394844132392551e-05, + "loss": 0.0328, + "step": 12420 + }, + { + "epoch": 0.744801965366409, + "grad_norm": 0.4205247461795807, + "learning_rate": 1.3939227867204692e-05, + "loss": 0.0282, + "step": 12430 + }, + { + "epoch": 0.7454011624423272, + "grad_norm": 1.0961225032806396, + "learning_rate": 1.3930010961136255e-05, + "loss": 0.0274, + "step": 12440 + }, + { + "epoch": 0.7460003595182455, + "grad_norm": 0.6065059304237366, + "learning_rate": 1.3920790616536761e-05, + "loss": 0.0327, + "step": 12450 + }, + { + "epoch": 0.7465995565941638, + "grad_norm": 0.3095875084400177, + "learning_rate": 1.3911566844226784e-05, + "loss": 0.0348, + "step": 12460 + }, + { + "epoch": 0.747198753670082, + "grad_norm": 0.8527400493621826, + "learning_rate": 1.3902339655030945e-05, + "loss": 0.0285, + "step": 12470 + }, + { + "epoch": 0.7477979507460003, + "grad_norm": 0.4449825882911682, + "learning_rate": 1.3893109059777858e-05, + "loss": 0.0435, + "step": 12480 + }, + { + "epoch": 0.7483971478219186, + "grad_norm": 1.1708461046218872, + "learning_rate": 1.3883875069300146e-05, + "loss": 0.0312, + "step": 12490 + }, + { + "epoch": 0.748996344897837, + "grad_norm": 0.6145966053009033, + "learning_rate": 1.387463769443441e-05, + "loss": 0.0283, + "step": 12500 + }, + { + "epoch": 0.7495955419737552, + "grad_norm": 0.5100684762001038, + "learning_rate": 1.3865396946021219e-05, + "loss": 0.0331, + "step": 12510 + }, + { + "epoch": 0.7501947390496735, + "grad_norm": 0.37704023718833923, + "learning_rate": 1.3856152834905113e-05, + "loss": 0.0327, + "step": 12520 + }, + { + "epoch": 0.7507939361255918, + "grad_norm": 0.6774486899375916, + "learning_rate": 1.3846905371934564e-05, + "loss": 0.0347, + "step": 12530 + }, + { + "epoch": 0.75139313320151, + "grad_norm": 0.4984931945800781, + "learning_rate": 1.3837654567961995e-05, + "loss": 0.0303, + "step": 12540 + }, + { + "epoch": 0.7519923302774283, + "grad_norm": 0.6189061403274536, + "learning_rate": 1.3828400433843728e-05, + "loss": 0.0316, + "step": 12550 + }, + { + "epoch": 0.7525915273533466, + "grad_norm": 0.4665672183036804, + "learning_rate": 1.3819142980440012e-05, + "loss": 0.038, + "step": 12560 + }, + { + "epoch": 0.7531907244292648, + "grad_norm": 0.898800790309906, + "learning_rate": 1.3809882218614981e-05, + "loss": 0.0292, + "step": 12570 + }, + { + "epoch": 0.7537899215051831, + "grad_norm": 0.5205129384994507, + "learning_rate": 1.3800618159236658e-05, + "loss": 0.0322, + "step": 12580 + }, + { + "epoch": 0.7543891185811014, + "grad_norm": 0.588542640209198, + "learning_rate": 1.3791350813176932e-05, + "loss": 0.0307, + "step": 12590 + }, + { + "epoch": 0.7549883156570196, + "grad_norm": 0.620620846748352, + "learning_rate": 1.3782080191311546e-05, + "loss": 0.035, + "step": 12600 + }, + { + "epoch": 0.7555875127329379, + "grad_norm": 0.639234185218811, + "learning_rate": 1.3772806304520097e-05, + "loss": 0.0296, + "step": 12610 + }, + { + "epoch": 0.7561867098088562, + "grad_norm": 0.38672956824302673, + "learning_rate": 1.3763529163686002e-05, + "loss": 0.0355, + "step": 12620 + }, + { + "epoch": 0.7567859068847744, + "grad_norm": 0.5244165062904358, + "learning_rate": 1.3754248779696509e-05, + "loss": 0.0305, + "step": 12630 + }, + { + "epoch": 0.7573851039606927, + "grad_norm": 0.8960945010185242, + "learning_rate": 1.374496516344266e-05, + "loss": 0.0323, + "step": 12640 + }, + { + "epoch": 0.757984301036611, + "grad_norm": 0.3789278566837311, + "learning_rate": 1.3735678325819295e-05, + "loss": 0.031, + "step": 12650 + }, + { + "epoch": 0.7585834981125292, + "grad_norm": 0.5104514956474304, + "learning_rate": 1.3726388277725041e-05, + "loss": 0.0405, + "step": 12660 + }, + { + "epoch": 0.7591826951884475, + "grad_norm": 0.5860878825187683, + "learning_rate": 1.3717095030062283e-05, + "loss": 0.0376, + "step": 12670 + }, + { + "epoch": 0.7597818922643658, + "grad_norm": 0.9913963079452515, + "learning_rate": 1.3707798593737162e-05, + "loss": 0.0386, + "step": 12680 + }, + { + "epoch": 0.760381089340284, + "grad_norm": 0.4112319350242615, + "learning_rate": 1.3698498979659571e-05, + "loss": 0.0276, + "step": 12690 + }, + { + "epoch": 0.7609802864162023, + "grad_norm": 0.703815221786499, + "learning_rate": 1.3689196198743122e-05, + "loss": 0.0303, + "step": 12700 + }, + { + "epoch": 0.7615794834921206, + "grad_norm": 0.7342479825019836, + "learning_rate": 1.3679890261905142e-05, + "loss": 0.0303, + "step": 12710 + }, + { + "epoch": 0.7621786805680388, + "grad_norm": 0.46025165915489197, + "learning_rate": 1.3670581180066672e-05, + "loss": 0.0324, + "step": 12720 + }, + { + "epoch": 0.7627778776439571, + "grad_norm": 0.3976695239543915, + "learning_rate": 1.3661268964152438e-05, + "loss": 0.0255, + "step": 12730 + }, + { + "epoch": 0.7633770747198754, + "grad_norm": 0.4137699604034424, + "learning_rate": 1.365195362509084e-05, + "loss": 0.0298, + "step": 12740 + }, + { + "epoch": 0.7639762717957936, + "grad_norm": 0.6333696842193604, + "learning_rate": 1.3642635173813949e-05, + "loss": 0.0438, + "step": 12750 + }, + { + "epoch": 0.7645754688717119, + "grad_norm": 0.5179958343505859, + "learning_rate": 1.3633313621257488e-05, + "loss": 0.0268, + "step": 12760 + }, + { + "epoch": 0.7651746659476302, + "grad_norm": 0.5947912335395813, + "learning_rate": 1.362398897836082e-05, + "loss": 0.0266, + "step": 12770 + }, + { + "epoch": 0.7657738630235484, + "grad_norm": 0.7916423678398132, + "learning_rate": 1.3614661256066925e-05, + "loss": 0.0363, + "step": 12780 + }, + { + "epoch": 0.7663730600994667, + "grad_norm": 0.7686305046081543, + "learning_rate": 1.3605330465322415e-05, + "loss": 0.0338, + "step": 12790 + }, + { + "epoch": 0.766972257175385, + "grad_norm": 0.5727254152297974, + "learning_rate": 1.3595996617077482e-05, + "loss": 0.0275, + "step": 12800 + }, + { + "epoch": 0.7675714542513032, + "grad_norm": 0.8913756012916565, + "learning_rate": 1.3586659722285927e-05, + "loss": 0.0365, + "step": 12810 + }, + { + "epoch": 0.7681706513272215, + "grad_norm": 0.45855259895324707, + "learning_rate": 1.3577319791905109e-05, + "loss": 0.0401, + "step": 12820 + }, + { + "epoch": 0.7687698484031398, + "grad_norm": 0.8214496374130249, + "learning_rate": 1.3567976836895962e-05, + "loss": 0.0371, + "step": 12830 + }, + { + "epoch": 0.769369045479058, + "grad_norm": 0.5001949667930603, + "learning_rate": 1.3558630868222955e-05, + "loss": 0.033, + "step": 12840 + }, + { + "epoch": 0.7699682425549763, + "grad_norm": 0.6546716094017029, + "learning_rate": 1.3549281896854115e-05, + "loss": 0.0422, + "step": 12850 + }, + { + "epoch": 0.7705674396308946, + "grad_norm": 0.35789239406585693, + "learning_rate": 1.3539929933760967e-05, + "loss": 0.0323, + "step": 12860 + }, + { + "epoch": 0.7711666367068128, + "grad_norm": 0.7539666891098022, + "learning_rate": 1.3530574989918572e-05, + "loss": 0.0316, + "step": 12870 + }, + { + "epoch": 0.7717658337827311, + "grad_norm": 0.422543466091156, + "learning_rate": 1.3521217076305478e-05, + "loss": 0.0388, + "step": 12880 + }, + { + "epoch": 0.7723650308586494, + "grad_norm": 0.5595449805259705, + "learning_rate": 1.3511856203903705e-05, + "loss": 0.0351, + "step": 12890 + }, + { + "epoch": 0.7729642279345676, + "grad_norm": 0.3847978115081787, + "learning_rate": 1.3502492383698772e-05, + "loss": 0.0285, + "step": 12900 + }, + { + "epoch": 0.7735634250104859, + "grad_norm": 0.4276559352874756, + "learning_rate": 1.3493125626679638e-05, + "loss": 0.0292, + "step": 12910 + }, + { + "epoch": 0.7741626220864042, + "grad_norm": 0.5125867128372192, + "learning_rate": 1.3483755943838715e-05, + "loss": 0.0351, + "step": 12920 + }, + { + "epoch": 0.7747618191623225, + "grad_norm": 0.7208243012428284, + "learning_rate": 1.3474383346171845e-05, + "loss": 0.0293, + "step": 12930 + }, + { + "epoch": 0.7753610162382407, + "grad_norm": 0.5181360244750977, + "learning_rate": 1.3465007844678295e-05, + "loss": 0.0316, + "step": 12940 + }, + { + "epoch": 0.775960213314159, + "grad_norm": 0.3499206304550171, + "learning_rate": 1.3455629450360738e-05, + "loss": 0.0281, + "step": 12950 + }, + { + "epoch": 0.7765594103900773, + "grad_norm": 0.26258599758148193, + "learning_rate": 1.3446248174225244e-05, + "loss": 0.027, + "step": 12960 + }, + { + "epoch": 0.7771586074659955, + "grad_norm": 0.7002774477005005, + "learning_rate": 1.3436864027281264e-05, + "loss": 0.031, + "step": 12970 + }, + { + "epoch": 0.7777578045419138, + "grad_norm": 0.5419202446937561, + "learning_rate": 1.3427477020541613e-05, + "loss": 0.0384, + "step": 12980 + }, + { + "epoch": 0.7783570016178321, + "grad_norm": 0.3112017512321472, + "learning_rate": 1.341808716502247e-05, + "loss": 0.0234, + "step": 12990 + }, + { + "epoch": 0.7789561986937503, + "grad_norm": 0.6459445357322693, + "learning_rate": 1.3408694471743346e-05, + "loss": 0.0302, + "step": 13000 + }, + { + "epoch": 0.7795553957696686, + "grad_norm": 0.5128807425498962, + "learning_rate": 1.3399298951727102e-05, + "loss": 0.0385, + "step": 13010 + }, + { + "epoch": 0.7801545928455869, + "grad_norm": 0.41403454542160034, + "learning_rate": 1.3389900615999895e-05, + "loss": 0.0321, + "step": 13020 + }, + { + "epoch": 0.7807537899215052, + "grad_norm": 0.4647153615951538, + "learning_rate": 1.33804994755912e-05, + "loss": 0.0358, + "step": 13030 + }, + { + "epoch": 0.7813529869974235, + "grad_norm": 0.29951611161231995, + "learning_rate": 1.3371095541533772e-05, + "loss": 0.0288, + "step": 13040 + }, + { + "epoch": 0.7819521840733418, + "grad_norm": 0.3440749943256378, + "learning_rate": 1.336168882486366e-05, + "loss": 0.0274, + "step": 13050 + }, + { + "epoch": 0.78255138114926, + "grad_norm": 0.413753867149353, + "learning_rate": 1.3352279336620167e-05, + "loss": 0.0276, + "step": 13060 + }, + { + "epoch": 0.7831505782251783, + "grad_norm": 0.29087361693382263, + "learning_rate": 1.3342867087845848e-05, + "loss": 0.03, + "step": 13070 + }, + { + "epoch": 0.7837497753010966, + "grad_norm": 0.7001593708992004, + "learning_rate": 1.3333452089586505e-05, + "loss": 0.0277, + "step": 13080 + }, + { + "epoch": 0.7843489723770148, + "grad_norm": 0.47245970368385315, + "learning_rate": 1.3324034352891162e-05, + "loss": 0.0426, + "step": 13090 + }, + { + "epoch": 0.7849481694529331, + "grad_norm": 0.5747501850128174, + "learning_rate": 1.3314613888812058e-05, + "loss": 0.0337, + "step": 13100 + }, + { + "epoch": 0.7855473665288514, + "grad_norm": 0.42420580983161926, + "learning_rate": 1.3305190708404633e-05, + "loss": 0.0407, + "step": 13110 + }, + { + "epoch": 0.7861465636047696, + "grad_norm": 0.2931080162525177, + "learning_rate": 1.3295764822727512e-05, + "loss": 0.0344, + "step": 13120 + }, + { + "epoch": 0.7867457606806879, + "grad_norm": 0.8410253524780273, + "learning_rate": 1.3286336242842496e-05, + "loss": 0.0385, + "step": 13130 + }, + { + "epoch": 0.7873449577566062, + "grad_norm": 0.27601751685142517, + "learning_rate": 1.3276904979814551e-05, + "loss": 0.0304, + "step": 13140 + }, + { + "epoch": 0.7879441548325244, + "grad_norm": 0.5673372745513916, + "learning_rate": 1.3267471044711788e-05, + "loss": 0.0261, + "step": 13150 + }, + { + "epoch": 0.7885433519084427, + "grad_norm": 0.5385505557060242, + "learning_rate": 1.325803444860546e-05, + "loss": 0.0296, + "step": 13160 + }, + { + "epoch": 0.789142548984361, + "grad_norm": 0.4159039556980133, + "learning_rate": 1.3248595202569932e-05, + "loss": 0.0343, + "step": 13170 + }, + { + "epoch": 0.7897417460602792, + "grad_norm": 1.0409079790115356, + "learning_rate": 1.3239153317682687e-05, + "loss": 0.0325, + "step": 13180 + }, + { + "epoch": 0.7903409431361975, + "grad_norm": 0.5017931461334229, + "learning_rate": 1.3229708805024304e-05, + "loss": 0.0311, + "step": 13190 + }, + { + "epoch": 0.7909401402121158, + "grad_norm": 0.45170727372169495, + "learning_rate": 1.3220261675678442e-05, + "loss": 0.0302, + "step": 13200 + }, + { + "epoch": 0.791539337288034, + "grad_norm": 0.7260886430740356, + "learning_rate": 1.3210811940731841e-05, + "loss": 0.0353, + "step": 13210 + }, + { + "epoch": 0.7921385343639523, + "grad_norm": 0.7251535058021545, + "learning_rate": 1.3201359611274277e-05, + "loss": 0.0329, + "step": 13220 + }, + { + "epoch": 0.7927377314398706, + "grad_norm": 0.21863135695457458, + "learning_rate": 1.3191904698398602e-05, + "loss": 0.0354, + "step": 13230 + }, + { + "epoch": 0.7933369285157889, + "grad_norm": 0.5168152451515198, + "learning_rate": 1.3182447213200666e-05, + "loss": 0.0268, + "step": 13240 + }, + { + "epoch": 0.7939361255917071, + "grad_norm": 0.509765088558197, + "learning_rate": 1.317298716677937e-05, + "loss": 0.0321, + "step": 13250 + }, + { + "epoch": 0.7945353226676254, + "grad_norm": 0.4227997958660126, + "learning_rate": 1.3163524570236596e-05, + "loss": 0.031, + "step": 13260 + }, + { + "epoch": 0.7951345197435437, + "grad_norm": 0.5740527510643005, + "learning_rate": 1.3154059434677232e-05, + "loss": 0.0351, + "step": 13270 + }, + { + "epoch": 0.7957337168194619, + "grad_norm": 0.5497387647628784, + "learning_rate": 1.3144591771209141e-05, + "loss": 0.0277, + "step": 13280 + }, + { + "epoch": 0.7963329138953802, + "grad_norm": 0.3965212106704712, + "learning_rate": 1.3135121590943149e-05, + "loss": 0.028, + "step": 13290 + }, + { + "epoch": 0.7969321109712985, + "grad_norm": 0.43198928236961365, + "learning_rate": 1.3125648904993052e-05, + "loss": 0.0421, + "step": 13300 + }, + { + "epoch": 0.7975313080472167, + "grad_norm": 0.42254316806793213, + "learning_rate": 1.311617372447556e-05, + "loss": 0.0335, + "step": 13310 + }, + { + "epoch": 0.798130505123135, + "grad_norm": 0.3395012617111206, + "learning_rate": 1.3106696060510333e-05, + "loss": 0.0309, + "step": 13320 + }, + { + "epoch": 0.7987297021990533, + "grad_norm": 0.6258816719055176, + "learning_rate": 1.3097215924219934e-05, + "loss": 0.0287, + "step": 13330 + }, + { + "epoch": 0.7993288992749715, + "grad_norm": 0.7914189100265503, + "learning_rate": 1.308773332672984e-05, + "loss": 0.0263, + "step": 13340 + }, + { + "epoch": 0.7999280963508898, + "grad_norm": 0.4104739725589752, + "learning_rate": 1.3078248279168394e-05, + "loss": 0.0282, + "step": 13350 + }, + { + "epoch": 0.8005272934268081, + "grad_norm": 0.47704172134399414, + "learning_rate": 1.3068760792666839e-05, + "loss": 0.0358, + "step": 13360 + }, + { + "epoch": 0.8011264905027263, + "grad_norm": 0.7908433675765991, + "learning_rate": 1.305927087835926e-05, + "loss": 0.0341, + "step": 13370 + }, + { + "epoch": 0.8017256875786446, + "grad_norm": 0.7039026021957397, + "learning_rate": 1.3049778547382608e-05, + "loss": 0.0369, + "step": 13380 + }, + { + "epoch": 0.8023248846545629, + "grad_norm": 0.4095489978790283, + "learning_rate": 1.3040283810876658e-05, + "loss": 0.047, + "step": 13390 + }, + { + "epoch": 0.8029240817304811, + "grad_norm": 0.6500707864761353, + "learning_rate": 1.3030786679984007e-05, + "loss": 0.0285, + "step": 13400 + }, + { + "epoch": 0.8035232788063994, + "grad_norm": 0.3794250190258026, + "learning_rate": 1.3021287165850079e-05, + "loss": 0.0293, + "step": 13410 + }, + { + "epoch": 0.8041224758823177, + "grad_norm": 0.3065261244773865, + "learning_rate": 1.3011785279623073e-05, + "loss": 0.031, + "step": 13420 + }, + { + "epoch": 0.8047216729582359, + "grad_norm": 0.3773103654384613, + "learning_rate": 1.3002281032453985e-05, + "loss": 0.0303, + "step": 13430 + }, + { + "epoch": 0.8053208700341542, + "grad_norm": 0.602186918258667, + "learning_rate": 1.299277443549658e-05, + "loss": 0.0398, + "step": 13440 + }, + { + "epoch": 0.8059200671100725, + "grad_norm": 0.5309048891067505, + "learning_rate": 1.2983265499907377e-05, + "loss": 0.0251, + "step": 13450 + }, + { + "epoch": 0.8065192641859907, + "grad_norm": 0.9474682211875916, + "learning_rate": 1.2973754236845642e-05, + "loss": 0.0345, + "step": 13460 + }, + { + "epoch": 0.807118461261909, + "grad_norm": 0.7786683440208435, + "learning_rate": 1.2964240657473372e-05, + "loss": 0.0289, + "step": 13470 + }, + { + "epoch": 0.8077176583378273, + "grad_norm": 0.6320096850395203, + "learning_rate": 1.2954724772955285e-05, + "loss": 0.0326, + "step": 13480 + }, + { + "epoch": 0.8083168554137455, + "grad_norm": 0.7034086585044861, + "learning_rate": 1.2945206594458794e-05, + "loss": 0.0332, + "step": 13490 + }, + { + "epoch": 0.8089160524896638, + "grad_norm": 0.5060988664627075, + "learning_rate": 1.2935686133154022e-05, + "loss": 0.0337, + "step": 13500 + }, + { + "epoch": 0.8095152495655821, + "grad_norm": 0.7484520673751831, + "learning_rate": 1.292616340021375e-05, + "loss": 0.0317, + "step": 13510 + }, + { + "epoch": 0.8101144466415003, + "grad_norm": 0.6556681394577026, + "learning_rate": 1.2916638406813446e-05, + "loss": 0.0349, + "step": 13520 + }, + { + "epoch": 0.8107136437174186, + "grad_norm": 0.41952699422836304, + "learning_rate": 1.2907111164131215e-05, + "loss": 0.0318, + "step": 13530 + }, + { + "epoch": 0.8113128407933369, + "grad_norm": 0.4678110182285309, + "learning_rate": 1.2897581683347806e-05, + "loss": 0.0328, + "step": 13540 + }, + { + "epoch": 0.8119120378692551, + "grad_norm": 0.35579657554626465, + "learning_rate": 1.2888049975646593e-05, + "loss": 0.0346, + "step": 13550 + }, + { + "epoch": 0.8125112349451735, + "grad_norm": 0.5984554290771484, + "learning_rate": 1.2878516052213575e-05, + "loss": 0.0277, + "step": 13560 + }, + { + "epoch": 0.8131104320210918, + "grad_norm": 0.41169118881225586, + "learning_rate": 1.286897992423733e-05, + "loss": 0.0288, + "step": 13570 + }, + { + "epoch": 0.8137096290970101, + "grad_norm": 0.5163332223892212, + "learning_rate": 1.285944160290905e-05, + "loss": 0.027, + "step": 13580 + }, + { + "epoch": 0.8143088261729283, + "grad_norm": 0.780305802822113, + "learning_rate": 1.284990109942247e-05, + "loss": 0.0249, + "step": 13590 + }, + { + "epoch": 0.8149080232488466, + "grad_norm": 0.4293205142021179, + "learning_rate": 1.2840358424973916e-05, + "loss": 0.0302, + "step": 13600 + }, + { + "epoch": 0.8155072203247649, + "grad_norm": 0.650065004825592, + "learning_rate": 1.2830813590762241e-05, + "loss": 0.0349, + "step": 13610 + }, + { + "epoch": 0.8161064174006831, + "grad_norm": 0.3155161142349243, + "learning_rate": 1.282126660798884e-05, + "loss": 0.0333, + "step": 13620 + }, + { + "epoch": 0.8167056144766014, + "grad_norm": 0.5841111540794373, + "learning_rate": 1.2811717487857633e-05, + "loss": 0.0371, + "step": 13630 + }, + { + "epoch": 0.8173048115525197, + "grad_norm": 0.3873291015625, + "learning_rate": 1.280216624157504e-05, + "loss": 0.0304, + "step": 13640 + }, + { + "epoch": 0.8179040086284379, + "grad_norm": 0.39657002687454224, + "learning_rate": 1.2792612880349982e-05, + "loss": 0.0279, + "step": 13650 + }, + { + "epoch": 0.8185032057043562, + "grad_norm": 0.6305680871009827, + "learning_rate": 1.278305741539386e-05, + "loss": 0.0293, + "step": 13660 + }, + { + "epoch": 0.8191024027802745, + "grad_norm": 0.5810249447822571, + "learning_rate": 1.2773499857920546e-05, + "loss": 0.0317, + "step": 13670 + }, + { + "epoch": 0.8197015998561927, + "grad_norm": 0.6288999319076538, + "learning_rate": 1.2763940219146367e-05, + "loss": 0.0283, + "step": 13680 + }, + { + "epoch": 0.820300796932111, + "grad_norm": 0.5402754545211792, + "learning_rate": 1.2754378510290087e-05, + "loss": 0.0258, + "step": 13690 + }, + { + "epoch": 0.8208999940080293, + "grad_norm": 1.3184820413589478, + "learning_rate": 1.2744814742572906e-05, + "loss": 0.0398, + "step": 13700 + }, + { + "epoch": 0.8214991910839475, + "grad_norm": 0.9564218521118164, + "learning_rate": 1.2735248927218437e-05, + "loss": 0.0301, + "step": 13710 + }, + { + "epoch": 0.8220983881598658, + "grad_norm": 0.8810652494430542, + "learning_rate": 1.2725681075452708e-05, + "loss": 0.0376, + "step": 13720 + }, + { + "epoch": 0.8226975852357841, + "grad_norm": 0.4254887104034424, + "learning_rate": 1.2716111198504106e-05, + "loss": 0.0336, + "step": 13730 + }, + { + "epoch": 0.8232967823117023, + "grad_norm": 0.45076319575309753, + "learning_rate": 1.270653930760343e-05, + "loss": 0.0266, + "step": 13740 + }, + { + "epoch": 0.8238959793876206, + "grad_norm": 0.6057546138763428, + "learning_rate": 1.2696965413983819e-05, + "loss": 0.0292, + "step": 13750 + }, + { + "epoch": 0.8244951764635389, + "grad_norm": 0.4007343649864197, + "learning_rate": 1.268738952888078e-05, + "loss": 0.0352, + "step": 13760 + }, + { + "epoch": 0.8250943735394571, + "grad_norm": 0.4183088541030884, + "learning_rate": 1.267781166353214e-05, + "loss": 0.0265, + "step": 13770 + }, + { + "epoch": 0.8256935706153754, + "grad_norm": 0.368300199508667, + "learning_rate": 1.2668231829178055e-05, + "loss": 0.0326, + "step": 13780 + }, + { + "epoch": 0.8262927676912937, + "grad_norm": 0.4838104844093323, + "learning_rate": 1.2658650037061003e-05, + "loss": 0.0262, + "step": 13790 + }, + { + "epoch": 0.8268919647672119, + "grad_norm": 0.5136057138442993, + "learning_rate": 1.2649066298425741e-05, + "loss": 0.0299, + "step": 13800 + }, + { + "epoch": 0.8274911618431302, + "grad_norm": 0.5161435604095459, + "learning_rate": 1.2639480624519328e-05, + "loss": 0.0339, + "step": 13810 + }, + { + "epoch": 0.8280903589190485, + "grad_norm": 0.6350359320640564, + "learning_rate": 1.2629893026591083e-05, + "loss": 0.0361, + "step": 13820 + }, + { + "epoch": 0.8286895559949667, + "grad_norm": 0.5247905254364014, + "learning_rate": 1.2620303515892587e-05, + "loss": 0.0259, + "step": 13830 + }, + { + "epoch": 0.829288753070885, + "grad_norm": 0.5668240785598755, + "learning_rate": 1.2610712103677662e-05, + "loss": 0.0324, + "step": 13840 + }, + { + "epoch": 0.8298879501468033, + "grad_norm": 0.48688119649887085, + "learning_rate": 1.2601118801202369e-05, + "loss": 0.0395, + "step": 13850 + }, + { + "epoch": 0.8304871472227215, + "grad_norm": 0.8496071100234985, + "learning_rate": 1.259152361972498e-05, + "loss": 0.0326, + "step": 13860 + }, + { + "epoch": 0.8310863442986398, + "grad_norm": 0.7072296142578125, + "learning_rate": 1.2581926570505975e-05, + "loss": 0.0307, + "step": 13870 + }, + { + "epoch": 0.8316855413745581, + "grad_norm": 0.7262448072433472, + "learning_rate": 1.257232766480803e-05, + "loss": 0.0376, + "step": 13880 + }, + { + "epoch": 0.8322847384504763, + "grad_norm": 0.5265096426010132, + "learning_rate": 1.2562726913895987e-05, + "loss": 0.0331, + "step": 13890 + }, + { + "epoch": 0.8328839355263946, + "grad_norm": 0.7246168851852417, + "learning_rate": 1.255312432903687e-05, + "loss": 0.0286, + "step": 13900 + }, + { + "epoch": 0.8334831326023129, + "grad_norm": 0.4539868235588074, + "learning_rate": 1.2543519921499843e-05, + "loss": 0.036, + "step": 13910 + }, + { + "epoch": 0.8340823296782311, + "grad_norm": 0.36881664395332336, + "learning_rate": 1.2533913702556216e-05, + "loss": 0.0302, + "step": 13920 + }, + { + "epoch": 0.8346815267541494, + "grad_norm": 0.37113773822784424, + "learning_rate": 1.2524305683479418e-05, + "loss": 0.0278, + "step": 13930 + }, + { + "epoch": 0.8352807238300677, + "grad_norm": 0.537762463092804, + "learning_rate": 1.2514695875544995e-05, + "loss": 0.0325, + "step": 13940 + }, + { + "epoch": 0.835879920905986, + "grad_norm": 0.6519997715950012, + "learning_rate": 1.2505084290030598e-05, + "loss": 0.0309, + "step": 13950 + }, + { + "epoch": 0.8364791179819042, + "grad_norm": 0.31448549032211304, + "learning_rate": 1.249547093821595e-05, + "loss": 0.0245, + "step": 13960 + }, + { + "epoch": 0.8370783150578225, + "grad_norm": 0.43815988302230835, + "learning_rate": 1.2485855831382862e-05, + "loss": 0.0398, + "step": 13970 + }, + { + "epoch": 0.8376775121337408, + "grad_norm": 0.525791585445404, + "learning_rate": 1.2476238980815193e-05, + "loss": 0.0261, + "step": 13980 + }, + { + "epoch": 0.838276709209659, + "grad_norm": 0.4887944757938385, + "learning_rate": 1.2466620397798852e-05, + "loss": 0.025, + "step": 13990 + }, + { + "epoch": 0.8388759062855773, + "grad_norm": 0.5287007689476013, + "learning_rate": 1.2457000093621785e-05, + "loss": 0.0278, + "step": 14000 + }, + { + "epoch": 0.8394751033614956, + "grad_norm": 0.7277513146400452, + "learning_rate": 1.2447378079573953e-05, + "loss": 0.0304, + "step": 14010 + }, + { + "epoch": 0.8400743004374138, + "grad_norm": 0.6415050029754639, + "learning_rate": 1.2437754366947326e-05, + "loss": 0.0292, + "step": 14020 + }, + { + "epoch": 0.8406734975133321, + "grad_norm": 0.48691895604133606, + "learning_rate": 1.2428128967035866e-05, + "loss": 0.0337, + "step": 14030 + }, + { + "epoch": 0.8412726945892504, + "grad_norm": 0.53068608045578, + "learning_rate": 1.2418501891135514e-05, + "loss": 0.0338, + "step": 14040 + }, + { + "epoch": 0.8418718916651686, + "grad_norm": 0.5464624762535095, + "learning_rate": 1.2408873150544187e-05, + "loss": 0.0303, + "step": 14050 + }, + { + "epoch": 0.8424710887410869, + "grad_norm": 0.3911614418029785, + "learning_rate": 1.2399242756561744e-05, + "loss": 0.0345, + "step": 14060 + }, + { + "epoch": 0.8430702858170052, + "grad_norm": 0.6894099116325378, + "learning_rate": 1.2389610720489986e-05, + "loss": 0.0365, + "step": 14070 + }, + { + "epoch": 0.8436694828929234, + "grad_norm": 0.5268317461013794, + "learning_rate": 1.2379977053632646e-05, + "loss": 0.0405, + "step": 14080 + }, + { + "epoch": 0.8442686799688418, + "grad_norm": 0.8635499477386475, + "learning_rate": 1.237034176729537e-05, + "loss": 0.0321, + "step": 14090 + }, + { + "epoch": 0.8448678770447601, + "grad_norm": 0.21542859077453613, + "learning_rate": 1.2360704872785704e-05, + "loss": 0.0264, + "step": 14100 + }, + { + "epoch": 0.8454670741206783, + "grad_norm": 0.6257337331771851, + "learning_rate": 1.2351066381413078e-05, + "loss": 0.0355, + "step": 14110 + }, + { + "epoch": 0.8460662711965966, + "grad_norm": 0.6525475978851318, + "learning_rate": 1.2341426304488798e-05, + "loss": 0.0304, + "step": 14120 + }, + { + "epoch": 0.8466654682725149, + "grad_norm": 0.4599299430847168, + "learning_rate": 1.2331784653326032e-05, + "loss": 0.0314, + "step": 14130 + }, + { + "epoch": 0.8472646653484331, + "grad_norm": 0.7497361898422241, + "learning_rate": 1.2322141439239794e-05, + "loss": 0.031, + "step": 14140 + }, + { + "epoch": 0.8478638624243514, + "grad_norm": 0.3124896287918091, + "learning_rate": 1.2312496673546937e-05, + "loss": 0.0257, + "step": 14150 + }, + { + "epoch": 0.8484630595002697, + "grad_norm": 0.6170748472213745, + "learning_rate": 1.2302850367566126e-05, + "loss": 0.0323, + "step": 14160 + }, + { + "epoch": 0.849062256576188, + "grad_norm": 0.4619428515434265, + "learning_rate": 1.229320253261784e-05, + "loss": 0.0315, + "step": 14170 + }, + { + "epoch": 0.8496614536521062, + "grad_norm": 0.5088011026382446, + "learning_rate": 1.2283553180024351e-05, + "loss": 0.0255, + "step": 14180 + }, + { + "epoch": 0.8502606507280245, + "grad_norm": 0.5397948622703552, + "learning_rate": 1.2273902321109714e-05, + "loss": 0.0265, + "step": 14190 + }, + { + "epoch": 0.8508598478039427, + "grad_norm": 0.457082062959671, + "learning_rate": 1.2264249967199744e-05, + "loss": 0.0279, + "step": 14200 + }, + { + "epoch": 0.851459044879861, + "grad_norm": 0.4131294786930084, + "learning_rate": 1.2254596129622021e-05, + "loss": 0.0269, + "step": 14210 + }, + { + "epoch": 0.8520582419557793, + "grad_norm": 1.1949660778045654, + "learning_rate": 1.2244940819705855e-05, + "loss": 0.042, + "step": 14220 + }, + { + "epoch": 0.8526574390316976, + "grad_norm": 0.6057063341140747, + "learning_rate": 1.2235284048782297e-05, + "loss": 0.0306, + "step": 14230 + }, + { + "epoch": 0.8532566361076158, + "grad_norm": 0.26918280124664307, + "learning_rate": 1.2225625828184105e-05, + "loss": 0.0283, + "step": 14240 + }, + { + "epoch": 0.8538558331835341, + "grad_norm": 0.48841091990470886, + "learning_rate": 1.2215966169245734e-05, + "loss": 0.0323, + "step": 14250 + }, + { + "epoch": 0.8544550302594524, + "grad_norm": 0.6195886135101318, + "learning_rate": 1.2206305083303331e-05, + "loss": 0.0295, + "step": 14260 + }, + { + "epoch": 0.8550542273353706, + "grad_norm": 0.5798623561859131, + "learning_rate": 1.2196642581694726e-05, + "loss": 0.031, + "step": 14270 + }, + { + "epoch": 0.8556534244112889, + "grad_norm": 0.4877539277076721, + "learning_rate": 1.2186978675759396e-05, + "loss": 0.0267, + "step": 14280 + }, + { + "epoch": 0.8562526214872072, + "grad_norm": 0.33261221647262573, + "learning_rate": 1.2177313376838478e-05, + "loss": 0.0261, + "step": 14290 + }, + { + "epoch": 0.8568518185631254, + "grad_norm": 0.8361077904701233, + "learning_rate": 1.2167646696274734e-05, + "loss": 0.0311, + "step": 14300 + }, + { + "epoch": 0.8574510156390437, + "grad_norm": 0.305922269821167, + "learning_rate": 1.2157978645412556e-05, + "loss": 0.0302, + "step": 14310 + }, + { + "epoch": 0.858050212714962, + "grad_norm": 0.22662357985973358, + "learning_rate": 1.2148309235597937e-05, + "loss": 0.028, + "step": 14320 + }, + { + "epoch": 0.8586494097908802, + "grad_norm": 0.4273515045642853, + "learning_rate": 1.2138638478178471e-05, + "loss": 0.0307, + "step": 14330 + }, + { + "epoch": 0.8592486068667985, + "grad_norm": 0.521216869354248, + "learning_rate": 1.2128966384503328e-05, + "loss": 0.0277, + "step": 14340 + }, + { + "epoch": 0.8598478039427168, + "grad_norm": 0.7090896368026733, + "learning_rate": 1.2119292965923246e-05, + "loss": 0.0346, + "step": 14350 + }, + { + "epoch": 0.860447001018635, + "grad_norm": 0.3693661391735077, + "learning_rate": 1.210961823379053e-05, + "loss": 0.0305, + "step": 14360 + }, + { + "epoch": 0.8610461980945533, + "grad_norm": 0.3651321530342102, + "learning_rate": 1.2099942199459006e-05, + "loss": 0.0263, + "step": 14370 + }, + { + "epoch": 0.8616453951704716, + "grad_norm": 0.5577923655509949, + "learning_rate": 1.2090264874284049e-05, + "loss": 0.0357, + "step": 14380 + }, + { + "epoch": 0.8622445922463898, + "grad_norm": 0.6504148840904236, + "learning_rate": 1.2080586269622531e-05, + "loss": 0.0404, + "step": 14390 + }, + { + "epoch": 0.8628437893223081, + "grad_norm": 0.49205282330513, + "learning_rate": 1.2070906396832835e-05, + "loss": 0.035, + "step": 14400 + }, + { + "epoch": 0.8634429863982264, + "grad_norm": 0.6053458452224731, + "learning_rate": 1.2061225267274837e-05, + "loss": 0.0328, + "step": 14410 + }, + { + "epoch": 0.8640421834741446, + "grad_norm": 0.5949649214744568, + "learning_rate": 1.2051542892309875e-05, + "loss": 0.0302, + "step": 14420 + }, + { + "epoch": 0.8646413805500629, + "grad_norm": 0.5310356020927429, + "learning_rate": 1.2041859283300762e-05, + "loss": 0.0264, + "step": 14430 + }, + { + "epoch": 0.8652405776259812, + "grad_norm": 0.4087911546230316, + "learning_rate": 1.2032174451611744e-05, + "loss": 0.0273, + "step": 14440 + }, + { + "epoch": 0.8658397747018994, + "grad_norm": 0.35929426550865173, + "learning_rate": 1.2022488408608519e-05, + "loss": 0.0274, + "step": 14450 + }, + { + "epoch": 0.8664389717778177, + "grad_norm": 0.5112904906272888, + "learning_rate": 1.2012801165658191e-05, + "loss": 0.0253, + "step": 14460 + }, + { + "epoch": 0.867038168853736, + "grad_norm": 0.39148232340812683, + "learning_rate": 1.2003112734129284e-05, + "loss": 0.0305, + "step": 14470 + }, + { + "epoch": 0.8676373659296542, + "grad_norm": 0.47718697786331177, + "learning_rate": 1.1993423125391712e-05, + "loss": 0.0304, + "step": 14480 + }, + { + "epoch": 0.8682365630055725, + "grad_norm": 0.620936393737793, + "learning_rate": 1.1983732350816765e-05, + "loss": 0.0289, + "step": 14490 + }, + { + "epoch": 0.8688357600814908, + "grad_norm": 0.8953443169593811, + "learning_rate": 1.1974040421777115e-05, + "loss": 0.0328, + "step": 14500 + }, + { + "epoch": 0.869434957157409, + "grad_norm": 0.4663226902484894, + "learning_rate": 1.1964347349646773e-05, + "loss": 0.0302, + "step": 14510 + }, + { + "epoch": 0.8700341542333273, + "grad_norm": 0.707167387008667, + "learning_rate": 1.1954653145801105e-05, + "loss": 0.0319, + "step": 14520 + }, + { + "epoch": 0.8706333513092456, + "grad_norm": 0.5325813889503479, + "learning_rate": 1.1944957821616791e-05, + "loss": 0.0318, + "step": 14530 + }, + { + "epoch": 0.8712325483851638, + "grad_norm": 0.6239158511161804, + "learning_rate": 1.1935261388471843e-05, + "loss": 0.0289, + "step": 14540 + }, + { + "epoch": 0.8718317454610821, + "grad_norm": 0.38823947310447693, + "learning_rate": 1.192556385774556e-05, + "loss": 0.0266, + "step": 14550 + }, + { + "epoch": 0.8724309425370004, + "grad_norm": 0.48849165439605713, + "learning_rate": 1.1915865240818538e-05, + "loss": 0.0234, + "step": 14560 + }, + { + "epoch": 0.8730301396129186, + "grad_norm": 0.23214028775691986, + "learning_rate": 1.190616554907264e-05, + "loss": 0.0276, + "step": 14570 + }, + { + "epoch": 0.8736293366888369, + "grad_norm": 0.3467197120189667, + "learning_rate": 1.1896464793890998e-05, + "loss": 0.0282, + "step": 14580 + }, + { + "epoch": 0.8742285337647552, + "grad_norm": 0.2009357064962387, + "learning_rate": 1.188676298665799e-05, + "loss": 0.0298, + "step": 14590 + }, + { + "epoch": 0.8748277308406734, + "grad_norm": 0.8589951395988464, + "learning_rate": 1.187706013875922e-05, + "loss": 0.0264, + "step": 14600 + }, + { + "epoch": 0.8754269279165917, + "grad_norm": 0.43969056010246277, + "learning_rate": 1.1867356261581527e-05, + "loss": 0.0292, + "step": 14610 + }, + { + "epoch": 0.8760261249925101, + "grad_norm": 0.5750611424446106, + "learning_rate": 1.1857651366512953e-05, + "loss": 0.0289, + "step": 14620 + }, + { + "epoch": 0.8766253220684284, + "grad_norm": 0.5399556756019592, + "learning_rate": 1.1847945464942723e-05, + "loss": 0.0307, + "step": 14630 + }, + { + "epoch": 0.8772245191443466, + "grad_norm": 0.20517395436763763, + "learning_rate": 1.1838238568261262e-05, + "loss": 0.0249, + "step": 14640 + }, + { + "epoch": 0.8778237162202649, + "grad_norm": 0.7490189671516418, + "learning_rate": 1.1828530687860151e-05, + "loss": 0.0246, + "step": 14650 + }, + { + "epoch": 0.8784229132961832, + "grad_norm": 0.6661257743835449, + "learning_rate": 1.1818821835132133e-05, + "loss": 0.0325, + "step": 14660 + }, + { + "epoch": 0.8790221103721014, + "grad_norm": 0.571394681930542, + "learning_rate": 1.1809112021471077e-05, + "loss": 0.0342, + "step": 14670 + }, + { + "epoch": 0.8796213074480197, + "grad_norm": 0.8792482018470764, + "learning_rate": 1.1799401258272001e-05, + "loss": 0.0332, + "step": 14680 + }, + { + "epoch": 0.880220504523938, + "grad_norm": 0.5770248770713806, + "learning_rate": 1.1789689556931017e-05, + "loss": 0.0286, + "step": 14690 + }, + { + "epoch": 0.8808197015998562, + "grad_norm": 0.62962406873703, + "learning_rate": 1.1779976928845356e-05, + "loss": 0.0246, + "step": 14700 + }, + { + "epoch": 0.8814188986757745, + "grad_norm": 0.4651380479335785, + "learning_rate": 1.1770263385413325e-05, + "loss": 0.037, + "step": 14710 + }, + { + "epoch": 0.8820180957516928, + "grad_norm": 0.5087499022483826, + "learning_rate": 1.1760548938034308e-05, + "loss": 0.0265, + "step": 14720 + }, + { + "epoch": 0.882617292827611, + "grad_norm": 0.44421979784965515, + "learning_rate": 1.1750833598108746e-05, + "loss": 0.0306, + "step": 14730 + }, + { + "epoch": 0.8832164899035293, + "grad_norm": 0.6521517038345337, + "learning_rate": 1.1741117377038138e-05, + "loss": 0.0334, + "step": 14740 + }, + { + "epoch": 0.8838156869794476, + "grad_norm": 0.5384942889213562, + "learning_rate": 1.1731400286225005e-05, + "loss": 0.0296, + "step": 14750 + }, + { + "epoch": 0.8844148840553658, + "grad_norm": 0.41909387707710266, + "learning_rate": 1.1721682337072901e-05, + "loss": 0.0297, + "step": 14760 + }, + { + "epoch": 0.8850140811312841, + "grad_norm": 0.6697047352790833, + "learning_rate": 1.1711963540986377e-05, + "loss": 0.0331, + "step": 14770 + }, + { + "epoch": 0.8856132782072024, + "grad_norm": 0.4015032947063446, + "learning_rate": 1.1702243909370978e-05, + "loss": 0.0326, + "step": 14780 + }, + { + "epoch": 0.8862124752831206, + "grad_norm": 0.48070228099823, + "learning_rate": 1.169252345363324e-05, + "loss": 0.0278, + "step": 14790 + }, + { + "epoch": 0.8868116723590389, + "grad_norm": 0.8651071786880493, + "learning_rate": 1.1682802185180655e-05, + "loss": 0.0242, + "step": 14800 + }, + { + "epoch": 0.8874108694349572, + "grad_norm": 1.17703378200531, + "learning_rate": 1.1673080115421673e-05, + "loss": 0.0288, + "step": 14810 + }, + { + "epoch": 0.8880100665108754, + "grad_norm": 0.45865103602409363, + "learning_rate": 1.1663357255765684e-05, + "loss": 0.0322, + "step": 14820 + }, + { + "epoch": 0.8886092635867937, + "grad_norm": 0.41243845224380493, + "learning_rate": 1.1653633617623006e-05, + "loss": 0.0297, + "step": 14830 + }, + { + "epoch": 0.889208460662712, + "grad_norm": 0.482997864484787, + "learning_rate": 1.1643909212404869e-05, + "loss": 0.0305, + "step": 14840 + }, + { + "epoch": 0.8898076577386302, + "grad_norm": 0.5319142937660217, + "learning_rate": 1.1634184051523409e-05, + "loss": 0.0284, + "step": 14850 + }, + { + "epoch": 0.8904068548145485, + "grad_norm": 0.6116752028465271, + "learning_rate": 1.1624458146391642e-05, + "loss": 0.0311, + "step": 14860 + }, + { + "epoch": 0.8910060518904668, + "grad_norm": 0.4214901328086853, + "learning_rate": 1.1614731508423453e-05, + "loss": 0.0269, + "step": 14870 + }, + { + "epoch": 0.891605248966385, + "grad_norm": 0.6246733069419861, + "learning_rate": 1.1605004149033603e-05, + "loss": 0.026, + "step": 14880 + }, + { + "epoch": 0.8922044460423033, + "grad_norm": 0.4263368248939514, + "learning_rate": 1.159527607963768e-05, + "loss": 0.0305, + "step": 14890 + }, + { + "epoch": 0.8928036431182216, + "grad_norm": 0.4059041738510132, + "learning_rate": 1.1585547311652127e-05, + "loss": 0.022, + "step": 14900 + }, + { + "epoch": 0.8934028401941398, + "grad_norm": 0.6362516283988953, + "learning_rate": 1.1575817856494185e-05, + "loss": 0.0265, + "step": 14910 + }, + { + "epoch": 0.8940020372700581, + "grad_norm": 0.2905973494052887, + "learning_rate": 1.1566087725581918e-05, + "loss": 0.0297, + "step": 14920 + }, + { + "epoch": 0.8946012343459764, + "grad_norm": 0.42270833253860474, + "learning_rate": 1.1556356930334172e-05, + "loss": 0.0255, + "step": 14930 + }, + { + "epoch": 0.8952004314218946, + "grad_norm": 0.26410749554634094, + "learning_rate": 1.1546625482170582e-05, + "loss": 0.0252, + "step": 14940 + }, + { + "epoch": 0.8957996284978129, + "grad_norm": 0.7570974230766296, + "learning_rate": 1.153689339251154e-05, + "loss": 0.027, + "step": 14950 + }, + { + "epoch": 0.8963988255737312, + "grad_norm": 0.5941224098205566, + "learning_rate": 1.1527160672778195e-05, + "loss": 0.0295, + "step": 14960 + }, + { + "epoch": 0.8969980226496495, + "grad_norm": 0.3985750079154968, + "learning_rate": 1.1517427334392439e-05, + "loss": 0.0337, + "step": 14970 + }, + { + "epoch": 0.8975972197255677, + "grad_norm": 0.3877560496330261, + "learning_rate": 1.1507693388776885e-05, + "loss": 0.024, + "step": 14980 + }, + { + "epoch": 0.898196416801486, + "grad_norm": 0.44742006063461304, + "learning_rate": 1.1497958847354861e-05, + "loss": 0.0284, + "step": 14990 + }, + { + "epoch": 0.8987956138774043, + "grad_norm": 0.3280893564224243, + "learning_rate": 1.1488223721550394e-05, + "loss": 0.0318, + "step": 15000 + }, + { + "epoch": 0.8993948109533225, + "grad_norm": 0.5289477109909058, + "learning_rate": 1.1478488022788199e-05, + "loss": 0.0341, + "step": 15010 + }, + { + "epoch": 0.8999940080292408, + "grad_norm": 0.4976208806037903, + "learning_rate": 1.146875176249365e-05, + "loss": 0.0239, + "step": 15020 + }, + { + "epoch": 0.9005932051051591, + "grad_norm": 0.6153465509414673, + "learning_rate": 1.1459014952092803e-05, + "loss": 0.0252, + "step": 15030 + }, + { + "epoch": 0.9011924021810773, + "grad_norm": 0.6112402677536011, + "learning_rate": 1.1449277603012345e-05, + "loss": 0.0292, + "step": 15040 + }, + { + "epoch": 0.9017915992569956, + "grad_norm": 0.4973732531070709, + "learning_rate": 1.1439539726679592e-05, + "loss": 0.0307, + "step": 15050 + }, + { + "epoch": 0.9023907963329139, + "grad_norm": 0.5871816277503967, + "learning_rate": 1.1429801334522487e-05, + "loss": 0.0254, + "step": 15060 + }, + { + "epoch": 0.9029899934088321, + "grad_norm": 1.2150986194610596, + "learning_rate": 1.1420062437969575e-05, + "loss": 0.033, + "step": 15070 + }, + { + "epoch": 0.9035891904847504, + "grad_norm": 0.6406526565551758, + "learning_rate": 1.1410323048449998e-05, + "loss": 0.0265, + "step": 15080 + }, + { + "epoch": 0.9041883875606687, + "grad_norm": 0.4251798093318939, + "learning_rate": 1.1400583177393467e-05, + "loss": 0.0269, + "step": 15090 + }, + { + "epoch": 0.9047875846365869, + "grad_norm": 0.4702431857585907, + "learning_rate": 1.1390842836230268e-05, + "loss": 0.0311, + "step": 15100 + }, + { + "epoch": 0.9053867817125052, + "grad_norm": 0.3235304355621338, + "learning_rate": 1.1381102036391223e-05, + "loss": 0.0236, + "step": 15110 + }, + { + "epoch": 0.9059859787884235, + "grad_norm": 0.4913889467716217, + "learning_rate": 1.1371360789307718e-05, + "loss": 0.0231, + "step": 15120 + }, + { + "epoch": 0.9065851758643417, + "grad_norm": 0.4980977177619934, + "learning_rate": 1.1361619106411641e-05, + "loss": 0.0289, + "step": 15130 + }, + { + "epoch": 0.90718437294026, + "grad_norm": 0.740922212600708, + "learning_rate": 1.13518769991354e-05, + "loss": 0.0334, + "step": 15140 + }, + { + "epoch": 0.9077835700161784, + "grad_norm": 0.3305300772190094, + "learning_rate": 1.1342134478911897e-05, + "loss": 0.0301, + "step": 15150 + }, + { + "epoch": 0.9083827670920966, + "grad_norm": 0.7037357091903687, + "learning_rate": 1.1332391557174531e-05, + "loss": 0.0311, + "step": 15160 + }, + { + "epoch": 0.9089819641680149, + "grad_norm": 0.44783756136894226, + "learning_rate": 1.1322648245357156e-05, + "loss": 0.0339, + "step": 15170 + }, + { + "epoch": 0.9095811612439332, + "grad_norm": 0.7776843309402466, + "learning_rate": 1.1312904554894095e-05, + "loss": 0.0349, + "step": 15180 + }, + { + "epoch": 0.9101803583198514, + "grad_norm": 0.49181437492370605, + "learning_rate": 1.130316049722011e-05, + "loss": 0.0285, + "step": 15190 + }, + { + "epoch": 0.9107795553957697, + "grad_norm": 0.333814799785614, + "learning_rate": 1.1293416083770394e-05, + "loss": 0.0284, + "step": 15200 + }, + { + "epoch": 0.911378752471688, + "grad_norm": 1.203652262687683, + "learning_rate": 1.1283671325980563e-05, + "loss": 0.0365, + "step": 15210 + }, + { + "epoch": 0.9119779495476062, + "grad_norm": 0.521643877029419, + "learning_rate": 1.1273926235286627e-05, + "loss": 0.0313, + "step": 15220 + }, + { + "epoch": 0.9125771466235245, + "grad_norm": 0.33309581875801086, + "learning_rate": 1.1264180823125002e-05, + "loss": 0.0265, + "step": 15230 + }, + { + "epoch": 0.9131763436994428, + "grad_norm": 0.48567256331443787, + "learning_rate": 1.1254435100932462e-05, + "loss": 0.0357, + "step": 15240 + }, + { + "epoch": 0.913775540775361, + "grad_norm": 0.8473871946334839, + "learning_rate": 1.124468908014616e-05, + "loss": 0.0355, + "step": 15250 + }, + { + "epoch": 0.9143747378512793, + "grad_norm": 0.43827518820762634, + "learning_rate": 1.123494277220359e-05, + "loss": 0.0266, + "step": 15260 + }, + { + "epoch": 0.9149739349271976, + "grad_norm": 0.5849157571792603, + "learning_rate": 1.1225196188542595e-05, + "loss": 0.0317, + "step": 15270 + }, + { + "epoch": 0.9155731320031159, + "grad_norm": 0.5690399408340454, + "learning_rate": 1.1215449340601328e-05, + "loss": 0.0266, + "step": 15280 + }, + { + "epoch": 0.9161723290790341, + "grad_norm": 0.6484784483909607, + "learning_rate": 1.1205702239818259e-05, + "loss": 0.0294, + "step": 15290 + }, + { + "epoch": 0.9167715261549524, + "grad_norm": 0.8894811272621155, + "learning_rate": 1.1195954897632153e-05, + "loss": 0.0239, + "step": 15300 + }, + { + "epoch": 0.9173707232308707, + "grad_norm": 0.4575272798538208, + "learning_rate": 1.1186207325482062e-05, + "loss": 0.0323, + "step": 15310 + }, + { + "epoch": 0.9179699203067889, + "grad_norm": 0.4288756847381592, + "learning_rate": 1.1176459534807304e-05, + "loss": 0.032, + "step": 15320 + }, + { + "epoch": 0.9185691173827072, + "grad_norm": 0.8871303200721741, + "learning_rate": 1.116671153704745e-05, + "loss": 0.0243, + "step": 15330 + }, + { + "epoch": 0.9191683144586255, + "grad_norm": 0.5861580967903137, + "learning_rate": 1.1156963343642324e-05, + "loss": 0.0335, + "step": 15340 + }, + { + "epoch": 0.9197675115345437, + "grad_norm": 0.4159319996833801, + "learning_rate": 1.1147214966031968e-05, + "loss": 0.0247, + "step": 15350 + }, + { + "epoch": 0.920366708610462, + "grad_norm": 0.6948496699333191, + "learning_rate": 1.1137466415656653e-05, + "loss": 0.0299, + "step": 15360 + }, + { + "epoch": 0.9209659056863803, + "grad_norm": 0.5089551210403442, + "learning_rate": 1.112771770395684e-05, + "loss": 0.0333, + "step": 15370 + }, + { + "epoch": 0.9215651027622985, + "grad_norm": 0.6912631392478943, + "learning_rate": 1.1117968842373185e-05, + "loss": 0.0303, + "step": 15380 + }, + { + "epoch": 0.9221642998382168, + "grad_norm": 0.6213784217834473, + "learning_rate": 1.1108219842346528e-05, + "loss": 0.0295, + "step": 15390 + }, + { + "epoch": 0.9227634969141351, + "grad_norm": 0.4634060561656952, + "learning_rate": 1.1098470715317855e-05, + "loss": 0.0261, + "step": 15400 + }, + { + "epoch": 0.9233626939900533, + "grad_norm": 0.5664045214653015, + "learning_rate": 1.1088721472728314e-05, + "loss": 0.0262, + "step": 15410 + }, + { + "epoch": 0.9239618910659716, + "grad_norm": 0.7963227033615112, + "learning_rate": 1.1078972126019184e-05, + "loss": 0.0278, + "step": 15420 + }, + { + "epoch": 0.9245610881418899, + "grad_norm": 0.45378491282463074, + "learning_rate": 1.1069222686631866e-05, + "loss": 0.0268, + "step": 15430 + }, + { + "epoch": 0.9251602852178081, + "grad_norm": 0.8970746994018555, + "learning_rate": 1.1059473166007867e-05, + "loss": 0.0271, + "step": 15440 + }, + { + "epoch": 0.9257594822937264, + "grad_norm": 0.5109472274780273, + "learning_rate": 1.1049723575588796e-05, + "loss": 0.0307, + "step": 15450 + }, + { + "epoch": 0.9263586793696447, + "grad_norm": 0.5023297667503357, + "learning_rate": 1.1039973926816338e-05, + "loss": 0.0263, + "step": 15460 + }, + { + "epoch": 0.9269578764455629, + "grad_norm": 0.6055631041526794, + "learning_rate": 1.103022423113225e-05, + "loss": 0.0285, + "step": 15470 + }, + { + "epoch": 0.9275570735214812, + "grad_norm": 0.38602766394615173, + "learning_rate": 1.1020474499978346e-05, + "loss": 0.0282, + "step": 15480 + }, + { + "epoch": 0.9281562705973995, + "grad_norm": 0.5447302460670471, + "learning_rate": 1.1010724744796476e-05, + "loss": 0.0319, + "step": 15490 + }, + { + "epoch": 0.9287554676733177, + "grad_norm": 0.6613780856132507, + "learning_rate": 1.1000974977028517e-05, + "loss": 0.0271, + "step": 15500 + }, + { + "epoch": 0.929354664749236, + "grad_norm": 1.0358555316925049, + "learning_rate": 1.0991225208116372e-05, + "loss": 0.026, + "step": 15510 + }, + { + "epoch": 0.9299538618251543, + "grad_norm": 0.4463629722595215, + "learning_rate": 1.0981475449501935e-05, + "loss": 0.0271, + "step": 15520 + }, + { + "epoch": 0.9305530589010725, + "grad_norm": 0.5373798608779907, + "learning_rate": 1.0971725712627086e-05, + "loss": 0.025, + "step": 15530 + }, + { + "epoch": 0.9311522559769908, + "grad_norm": 0.7735916972160339, + "learning_rate": 1.096197600893369e-05, + "loss": 0.0325, + "step": 15540 + }, + { + "epoch": 0.9317514530529091, + "grad_norm": 0.5017692446708679, + "learning_rate": 1.0952226349863563e-05, + "loss": 0.0262, + "step": 15550 + }, + { + "epoch": 0.9323506501288273, + "grad_norm": 0.3406142592430115, + "learning_rate": 1.0942476746858477e-05, + "loss": 0.0271, + "step": 15560 + }, + { + "epoch": 0.9329498472047456, + "grad_norm": 0.28971537947654724, + "learning_rate": 1.0932727211360133e-05, + "loss": 0.0238, + "step": 15570 + }, + { + "epoch": 0.9335490442806639, + "grad_norm": 0.45441415905952454, + "learning_rate": 1.0922977754810156e-05, + "loss": 0.0261, + "step": 15580 + }, + { + "epoch": 0.9341482413565821, + "grad_norm": 0.4653581976890564, + "learning_rate": 1.0913228388650072e-05, + "loss": 0.026, + "step": 15590 + }, + { + "epoch": 0.9347474384325004, + "grad_norm": 0.5449947714805603, + "learning_rate": 1.0903479124321305e-05, + "loss": 0.0314, + "step": 15600 + }, + { + "epoch": 0.9353466355084187, + "grad_norm": 0.41015395522117615, + "learning_rate": 1.0893729973265164e-05, + "loss": 0.0272, + "step": 15610 + }, + { + "epoch": 0.935945832584337, + "grad_norm": 0.5936392545700073, + "learning_rate": 1.0883980946922816e-05, + "loss": 0.0269, + "step": 15620 + }, + { + "epoch": 0.9365450296602552, + "grad_norm": 0.5043690800666809, + "learning_rate": 1.0874232056735293e-05, + "loss": 0.0256, + "step": 15630 + }, + { + "epoch": 0.9371442267361735, + "grad_norm": 0.6176534295082092, + "learning_rate": 1.0864483314143452e-05, + "loss": 0.0285, + "step": 15640 + }, + { + "epoch": 0.9377434238120917, + "grad_norm": 0.6774734258651733, + "learning_rate": 1.0854734730587995e-05, + "loss": 0.0268, + "step": 15650 + }, + { + "epoch": 0.93834262088801, + "grad_norm": 0.7045454978942871, + "learning_rate": 1.0844986317509418e-05, + "loss": 0.0305, + "step": 15660 + }, + { + "epoch": 0.9389418179639283, + "grad_norm": 0.5905448794364929, + "learning_rate": 1.0835238086348034e-05, + "loss": 0.0284, + "step": 15670 + }, + { + "epoch": 0.9395410150398467, + "grad_norm": 0.7881343364715576, + "learning_rate": 1.0825490048543937e-05, + "loss": 0.0321, + "step": 15680 + }, + { + "epoch": 0.9401402121157649, + "grad_norm": 0.6635507941246033, + "learning_rate": 1.0815742215536983e-05, + "loss": 0.0284, + "step": 15690 + }, + { + "epoch": 0.9407394091916832, + "grad_norm": 0.46298888325691223, + "learning_rate": 1.0805994598766804e-05, + "loss": 0.0394, + "step": 15700 + }, + { + "epoch": 0.9413386062676015, + "grad_norm": 0.5187172889709473, + "learning_rate": 1.0796247209672767e-05, + "loss": 0.0257, + "step": 15710 + }, + { + "epoch": 0.9419378033435197, + "grad_norm": 0.5974661707878113, + "learning_rate": 1.0786500059693982e-05, + "loss": 0.0305, + "step": 15720 + }, + { + "epoch": 0.942537000419438, + "grad_norm": 0.5171123743057251, + "learning_rate": 1.0776753160269267e-05, + "loss": 0.0275, + "step": 15730 + }, + { + "epoch": 0.9431361974953563, + "grad_norm": 0.35988888144493103, + "learning_rate": 1.0767006522837153e-05, + "loss": 0.0295, + "step": 15740 + }, + { + "epoch": 0.9437353945712745, + "grad_norm": 0.30543047189712524, + "learning_rate": 1.0757260158835862e-05, + "loss": 0.0334, + "step": 15750 + }, + { + "epoch": 0.9443345916471928, + "grad_norm": 0.6582810878753662, + "learning_rate": 1.0747514079703296e-05, + "loss": 0.0309, + "step": 15760 + }, + { + "epoch": 0.9449337887231111, + "grad_norm": 0.4986134171485901, + "learning_rate": 1.0737768296877023e-05, + "loss": 0.0294, + "step": 15770 + }, + { + "epoch": 0.9455329857990293, + "grad_norm": 0.5560855269432068, + "learning_rate": 1.0728022821794257e-05, + "loss": 0.0224, + "step": 15780 + }, + { + "epoch": 0.9461321828749476, + "grad_norm": 0.28974607586860657, + "learning_rate": 1.071827766589186e-05, + "loss": 0.0313, + "step": 15790 + }, + { + "epoch": 0.9467313799508659, + "grad_norm": 0.24015791714191437, + "learning_rate": 1.0708532840606312e-05, + "loss": 0.026, + "step": 15800 + }, + { + "epoch": 0.9473305770267841, + "grad_norm": 0.2704199552536011, + "learning_rate": 1.0698788357373713e-05, + "loss": 0.0244, + "step": 15810 + }, + { + "epoch": 0.9479297741027024, + "grad_norm": 0.6661707162857056, + "learning_rate": 1.068904422762975e-05, + "loss": 0.027, + "step": 15820 + }, + { + "epoch": 0.9485289711786207, + "grad_norm": 0.5058556795120239, + "learning_rate": 1.067930046280971e-05, + "loss": 0.0254, + "step": 15830 + }, + { + "epoch": 0.9491281682545389, + "grad_norm": 0.7086800336837769, + "learning_rate": 1.0669557074348438e-05, + "loss": 0.0242, + "step": 15840 + }, + { + "epoch": 0.9497273653304572, + "grad_norm": 0.6752822399139404, + "learning_rate": 1.0659814073680346e-05, + "loss": 0.0262, + "step": 15850 + }, + { + "epoch": 0.9503265624063755, + "grad_norm": 0.8279762268066406, + "learning_rate": 1.0650071472239387e-05, + "loss": 0.0312, + "step": 15860 + }, + { + "epoch": 0.9509257594822937, + "grad_norm": 0.5070614814758301, + "learning_rate": 1.0640329281459045e-05, + "loss": 0.0308, + "step": 15870 + }, + { + "epoch": 0.951524956558212, + "grad_norm": 0.3933897614479065, + "learning_rate": 1.0630587512772325e-05, + "loss": 0.0287, + "step": 15880 + }, + { + "epoch": 0.9521241536341303, + "grad_norm": 0.37238794565200806, + "learning_rate": 1.0620846177611734e-05, + "loss": 0.0325, + "step": 15890 + }, + { + "epoch": 0.9527233507100485, + "grad_norm": 0.7591347098350525, + "learning_rate": 1.0611105287409276e-05, + "loss": 0.0265, + "step": 15900 + }, + { + "epoch": 0.9533225477859668, + "grad_norm": 0.4841652810573578, + "learning_rate": 1.060136485359642e-05, + "loss": 0.0331, + "step": 15910 + }, + { + "epoch": 0.9539217448618851, + "grad_norm": 0.45236295461654663, + "learning_rate": 1.0591624887604115e-05, + "loss": 0.0412, + "step": 15920 + }, + { + "epoch": 0.9545209419378033, + "grad_norm": 0.4774094820022583, + "learning_rate": 1.058188540086275e-05, + "loss": 0.0289, + "step": 15930 + }, + { + "epoch": 0.9551201390137216, + "grad_norm": 0.47564345598220825, + "learning_rate": 1.0572146404802157e-05, + "loss": 0.0294, + "step": 15940 + }, + { + "epoch": 0.9557193360896399, + "grad_norm": 0.341337651014328, + "learning_rate": 1.0562407910851589e-05, + "loss": 0.0281, + "step": 15950 + }, + { + "epoch": 0.9563185331655581, + "grad_norm": 0.341701865196228, + "learning_rate": 1.0552669930439712e-05, + "loss": 0.0224, + "step": 15960 + }, + { + "epoch": 0.9569177302414764, + "grad_norm": 0.6621959209442139, + "learning_rate": 1.0542932474994589e-05, + "loss": 0.0283, + "step": 15970 + }, + { + "epoch": 0.9575169273173947, + "grad_norm": 0.348466694355011, + "learning_rate": 1.0533195555943662e-05, + "loss": 0.0234, + "step": 15980 + }, + { + "epoch": 0.958116124393313, + "grad_norm": 0.35208311676979065, + "learning_rate": 1.0523459184713753e-05, + "loss": 0.0248, + "step": 15990 + }, + { + "epoch": 0.9587153214692312, + "grad_norm": 0.4973156154155731, + "learning_rate": 1.0513723372731031e-05, + "loss": 0.0246, + "step": 16000 + }, + { + "epoch": 0.9593145185451495, + "grad_norm": 0.3668982982635498, + "learning_rate": 1.0503988131421021e-05, + "loss": 0.0228, + "step": 16010 + }, + { + "epoch": 0.9599137156210678, + "grad_norm": 0.4771873950958252, + "learning_rate": 1.0494253472208563e-05, + "loss": 0.0303, + "step": 16020 + }, + { + "epoch": 0.960512912696986, + "grad_norm": 0.3595021665096283, + "learning_rate": 1.0484519406517828e-05, + "loss": 0.0265, + "step": 16030 + }, + { + "epoch": 0.9611121097729043, + "grad_norm": 0.6013099551200867, + "learning_rate": 1.0474785945772278e-05, + "loss": 0.0297, + "step": 16040 + }, + { + "epoch": 0.9617113068488226, + "grad_norm": 0.40996676683425903, + "learning_rate": 1.046505310139468e-05, + "loss": 0.0321, + "step": 16050 + }, + { + "epoch": 0.9623105039247408, + "grad_norm": 0.45742037892341614, + "learning_rate": 1.0455320884807064e-05, + "loss": 0.0288, + "step": 16060 + }, + { + "epoch": 0.9629097010006591, + "grad_norm": 0.8092222213745117, + "learning_rate": 1.0445589307430724e-05, + "loss": 0.0278, + "step": 16070 + }, + { + "epoch": 0.9635088980765774, + "grad_norm": 0.32741186022758484, + "learning_rate": 1.0435858380686215e-05, + "loss": 0.0288, + "step": 16080 + }, + { + "epoch": 0.9641080951524956, + "grad_norm": 0.5716732740402222, + "learning_rate": 1.0426128115993317e-05, + "loss": 0.0256, + "step": 16090 + }, + { + "epoch": 0.9647072922284139, + "grad_norm": 0.3263239562511444, + "learning_rate": 1.0416398524771041e-05, + "loss": 0.0271, + "step": 16100 + }, + { + "epoch": 0.9653064893043322, + "grad_norm": 0.35390567779541016, + "learning_rate": 1.04066696184376e-05, + "loss": 0.0266, + "step": 16110 + }, + { + "epoch": 0.9659056863802504, + "grad_norm": 0.36520150303840637, + "learning_rate": 1.0396941408410413e-05, + "loss": 0.0265, + "step": 16120 + }, + { + "epoch": 0.9665048834561687, + "grad_norm": 0.46227532625198364, + "learning_rate": 1.0387213906106074e-05, + "loss": 0.0305, + "step": 16130 + }, + { + "epoch": 0.967104080532087, + "grad_norm": 0.40079647302627563, + "learning_rate": 1.0377487122940347e-05, + "loss": 0.0327, + "step": 16140 + }, + { + "epoch": 0.9677032776080052, + "grad_norm": 0.3689155578613281, + "learning_rate": 1.0367761070328155e-05, + "loss": 0.0249, + "step": 16150 + }, + { + "epoch": 0.9683024746839235, + "grad_norm": 0.49527907371520996, + "learning_rate": 1.0358035759683563e-05, + "loss": 0.029, + "step": 16160 + }, + { + "epoch": 0.9689016717598418, + "grad_norm": 0.38931334018707275, + "learning_rate": 1.0348311202419765e-05, + "loss": 0.0233, + "step": 16170 + }, + { + "epoch": 0.96950086883576, + "grad_norm": 0.5698918700218201, + "learning_rate": 1.0338587409949068e-05, + "loss": 0.0269, + "step": 16180 + }, + { + "epoch": 0.9701000659116783, + "grad_norm": 1.0959579944610596, + "learning_rate": 1.0328864393682888e-05, + "loss": 0.029, + "step": 16190 + }, + { + "epoch": 0.9706992629875966, + "grad_norm": 0.6321646571159363, + "learning_rate": 1.0319142165031721e-05, + "loss": 0.0276, + "step": 16200 + }, + { + "epoch": 0.9712984600635148, + "grad_norm": 0.7166606783866882, + "learning_rate": 1.0309420735405151e-05, + "loss": 0.0292, + "step": 16210 + }, + { + "epoch": 0.9718976571394332, + "grad_norm": 0.6464444994926453, + "learning_rate": 1.029970011621181e-05, + "loss": 0.0246, + "step": 16220 + }, + { + "epoch": 0.9724968542153515, + "grad_norm": 0.7318128347396851, + "learning_rate": 1.0289980318859392e-05, + "loss": 0.0296, + "step": 16230 + }, + { + "epoch": 0.9730960512912697, + "grad_norm": 0.4828032851219177, + "learning_rate": 1.0280261354754619e-05, + "loss": 0.0247, + "step": 16240 + }, + { + "epoch": 0.973695248367188, + "grad_norm": 0.4509548842906952, + "learning_rate": 1.0270543235303229e-05, + "loss": 0.0241, + "step": 16250 + }, + { + "epoch": 0.9742944454431063, + "grad_norm": 0.413630872964859, + "learning_rate": 1.0260825971909992e-05, + "loss": 0.0313, + "step": 16260 + }, + { + "epoch": 0.9748936425190246, + "grad_norm": 0.42443349957466125, + "learning_rate": 1.0251109575978642e-05, + "loss": 0.0316, + "step": 16270 + }, + { + "epoch": 0.9754928395949428, + "grad_norm": 0.8199112415313721, + "learning_rate": 1.024139405891192e-05, + "loss": 0.0389, + "step": 16280 + }, + { + "epoch": 0.9760920366708611, + "grad_norm": 0.28918105363845825, + "learning_rate": 1.023167943211152e-05, + "loss": 0.0242, + "step": 16290 + }, + { + "epoch": 0.9766912337467794, + "grad_norm": 0.6759344339370728, + "learning_rate": 1.0221965706978108e-05, + "loss": 0.0308, + "step": 16300 + }, + { + "epoch": 0.9772904308226976, + "grad_norm": 0.5480250120162964, + "learning_rate": 1.0212252894911272e-05, + "loss": 0.025, + "step": 16310 + }, + { + "epoch": 0.9778896278986159, + "grad_norm": 0.48897549510002136, + "learning_rate": 1.0202541007309543e-05, + "loss": 0.027, + "step": 16320 + }, + { + "epoch": 0.9784888249745342, + "grad_norm": 0.6111220121383667, + "learning_rate": 1.0192830055570363e-05, + "loss": 0.0276, + "step": 16330 + }, + { + "epoch": 0.9790880220504524, + "grad_norm": 0.8852546215057373, + "learning_rate": 1.0183120051090065e-05, + "loss": 0.0251, + "step": 16340 + }, + { + "epoch": 0.9796872191263707, + "grad_norm": 0.5098162889480591, + "learning_rate": 1.0173411005263891e-05, + "loss": 0.022, + "step": 16350 + }, + { + "epoch": 0.980286416202289, + "grad_norm": 0.45974940061569214, + "learning_rate": 1.016370292948594e-05, + "loss": 0.0206, + "step": 16360 + }, + { + "epoch": 0.9808856132782072, + "grad_norm": 0.3925095200538635, + "learning_rate": 1.0153995835149188e-05, + "loss": 0.0251, + "step": 16370 + }, + { + "epoch": 0.9814848103541255, + "grad_norm": 0.5461363792419434, + "learning_rate": 1.0144289733645443e-05, + "loss": 0.0217, + "step": 16380 + }, + { + "epoch": 0.9820840074300438, + "grad_norm": 0.5685333609580994, + "learning_rate": 1.013458463636536e-05, + "loss": 0.0231, + "step": 16390 + }, + { + "epoch": 0.982683204505962, + "grad_norm": 0.494150310754776, + "learning_rate": 1.0124880554698406e-05, + "loss": 0.0243, + "step": 16400 + }, + { + "epoch": 0.9832824015818803, + "grad_norm": 0.8770614862442017, + "learning_rate": 1.011517750003287e-05, + "loss": 0.0286, + "step": 16410 + }, + { + "epoch": 0.9838815986577986, + "grad_norm": 0.27142134308815, + "learning_rate": 1.0105475483755817e-05, + "loss": 0.0253, + "step": 16420 + }, + { + "epoch": 0.9844807957337168, + "grad_norm": 0.3365682363510132, + "learning_rate": 1.0095774517253114e-05, + "loss": 0.0241, + "step": 16430 + }, + { + "epoch": 0.9850799928096351, + "grad_norm": 0.5512370467185974, + "learning_rate": 1.008607461190938e-05, + "loss": 0.0242, + "step": 16440 + }, + { + "epoch": 0.9856791898855534, + "grad_norm": 0.5581703782081604, + "learning_rate": 1.007637577910799e-05, + "loss": 0.0276, + "step": 16450 + }, + { + "epoch": 0.9862783869614716, + "grad_norm": 0.306773841381073, + "learning_rate": 1.0066678030231071e-05, + "loss": 0.0262, + "step": 16460 + }, + { + "epoch": 0.9868775840373899, + "grad_norm": 0.44620928168296814, + "learning_rate": 1.005698137665947e-05, + "loss": 0.0229, + "step": 16470 + }, + { + "epoch": 0.9874767811133082, + "grad_norm": 0.5870804786682129, + "learning_rate": 1.004728582977275e-05, + "loss": 0.0228, + "step": 16480 + }, + { + "epoch": 0.9880759781892264, + "grad_norm": 0.26162099838256836, + "learning_rate": 1.0037591400949174e-05, + "loss": 0.0278, + "step": 16490 + }, + { + "epoch": 0.9886751752651447, + "grad_norm": 0.27250319719314575, + "learning_rate": 1.0027898101565693e-05, + "loss": 0.0293, + "step": 16500 + }, + { + "epoch": 0.989274372341063, + "grad_norm": 0.8330137729644775, + "learning_rate": 1.0018205942997938e-05, + "loss": 0.0315, + "step": 16510 + }, + { + "epoch": 0.9898735694169812, + "grad_norm": 0.5206989645957947, + "learning_rate": 1.0008514936620197e-05, + "loss": 0.0282, + "step": 16520 + }, + { + "epoch": 0.9904727664928995, + "grad_norm": 0.5408382415771484, + "learning_rate": 9.998825093805402e-06, + "loss": 0.0359, + "step": 16530 + }, + { + "epoch": 0.9910719635688178, + "grad_norm": 0.30517199635505676, + "learning_rate": 9.989136425925123e-06, + "loss": 0.0267, + "step": 16540 + }, + { + "epoch": 0.991671160644736, + "grad_norm": 0.5315027236938477, + "learning_rate": 9.979448944349555e-06, + "loss": 0.0206, + "step": 16550 + }, + { + "epoch": 0.9922703577206543, + "grad_norm": 0.46061626076698303, + "learning_rate": 9.969762660447491e-06, + "loss": 0.0222, + "step": 16560 + }, + { + "epoch": 0.9928695547965726, + "grad_norm": 0.47393080592155457, + "learning_rate": 9.960077585586335e-06, + "loss": 0.0262, + "step": 16570 + }, + { + "epoch": 0.9934687518724908, + "grad_norm": 0.3686772882938385, + "learning_rate": 9.950393731132051e-06, + "loss": 0.0254, + "step": 16580 + }, + { + "epoch": 0.9940679489484091, + "grad_norm": 0.3312757611274719, + "learning_rate": 9.94071110844919e-06, + "loss": 0.0243, + "step": 16590 + }, + { + "epoch": 0.9946671460243274, + "grad_norm": 0.565447986125946, + "learning_rate": 9.931029728900841e-06, + "loss": 0.0267, + "step": 16600 + }, + { + "epoch": 0.9952663431002456, + "grad_norm": 0.5690101385116577, + "learning_rate": 9.921349603848651e-06, + "loss": 0.0237, + "step": 16610 + }, + { + "epoch": 0.9958655401761639, + "grad_norm": 0.44088438153266907, + "learning_rate": 9.911670744652783e-06, + "loss": 0.028, + "step": 16620 + }, + { + "epoch": 0.9964647372520822, + "grad_norm": 0.3708919882774353, + "learning_rate": 9.901993162671912e-06, + "loss": 0.0265, + "step": 16630 + }, + { + "epoch": 0.9970639343280004, + "grad_norm": 0.589698851108551, + "learning_rate": 9.892316869263226e-06, + "loss": 0.0297, + "step": 16640 + }, + { + "epoch": 0.9976631314039187, + "grad_norm": 0.6541375517845154, + "learning_rate": 9.882641875782389e-06, + "loss": 0.0288, + "step": 16650 + }, + { + "epoch": 0.998262328479837, + "grad_norm": 0.5304558873176575, + "learning_rate": 9.87296819358355e-06, + "loss": 0.0243, + "step": 16660 + }, + { + "epoch": 0.9988615255557552, + "grad_norm": 0.5774737000465393, + "learning_rate": 9.863295834019308e-06, + "loss": 0.0277, + "step": 16670 + }, + { + "epoch": 0.9994607226316735, + "grad_norm": 0.5616280436515808, + "learning_rate": 9.853624808440722e-06, + "loss": 0.0267, + "step": 16680 + }, + { + "epoch": 1.000059919707592, + "grad_norm": 0.6129759550094604, + "learning_rate": 9.843955128197274e-06, + "loss": 0.0223, + "step": 16690 + }, + { + "epoch": 1.0006591167835102, + "grad_norm": 0.45278221368789673, + "learning_rate": 9.834286804636876e-06, + "loss": 0.0304, + "step": 16700 + }, + { + "epoch": 1.0012583138594284, + "grad_norm": 0.44487202167510986, + "learning_rate": 9.824619849105848e-06, + "loss": 0.0296, + "step": 16710 + }, + { + "epoch": 1.0018575109353467, + "grad_norm": 0.5391712188720703, + "learning_rate": 9.814954272948889e-06, + "loss": 0.0256, + "step": 16720 + }, + { + "epoch": 1.002456708011265, + "grad_norm": 0.43523359298706055, + "learning_rate": 9.805290087509098e-06, + "loss": 0.0277, + "step": 16730 + }, + { + "epoch": 1.0030559050871832, + "grad_norm": 0.5308435559272766, + "learning_rate": 9.795627304127936e-06, + "loss": 0.0242, + "step": 16740 + }, + { + "epoch": 1.0036551021631015, + "grad_norm": 0.3361283540725708, + "learning_rate": 9.785965934145216e-06, + "loss": 0.0236, + "step": 16750 + }, + { + "epoch": 1.0042542992390198, + "grad_norm": 0.3764631450176239, + "learning_rate": 9.77630598889909e-06, + "loss": 0.0304, + "step": 16760 + }, + { + "epoch": 1.004853496314938, + "grad_norm": 0.9003425240516663, + "learning_rate": 9.76664747972605e-06, + "loss": 0.0278, + "step": 16770 + }, + { + "epoch": 1.0054526933908563, + "grad_norm": 0.2787775993347168, + "learning_rate": 9.75699041796089e-06, + "loss": 0.0219, + "step": 16780 + }, + { + "epoch": 1.0060518904667746, + "grad_norm": 0.40089285373687744, + "learning_rate": 9.74733481493671e-06, + "loss": 0.0284, + "step": 16790 + }, + { + "epoch": 1.0066510875426928, + "grad_norm": 0.3619711101055145, + "learning_rate": 9.737680681984893e-06, + "loss": 0.0252, + "step": 16800 + }, + { + "epoch": 1.007250284618611, + "grad_norm": 0.7354542016983032, + "learning_rate": 9.728028030435114e-06, + "loss": 0.0242, + "step": 16810 + }, + { + "epoch": 1.0078494816945294, + "grad_norm": 0.3854006826877594, + "learning_rate": 9.71837687161529e-06, + "loss": 0.0302, + "step": 16820 + }, + { + "epoch": 1.0084486787704476, + "grad_norm": 0.3318389058113098, + "learning_rate": 9.708727216851588e-06, + "loss": 0.0265, + "step": 16830 + }, + { + "epoch": 1.009047875846366, + "grad_norm": 0.5286651849746704, + "learning_rate": 9.699079077468423e-06, + "loss": 0.0235, + "step": 16840 + }, + { + "epoch": 1.0096470729222842, + "grad_norm": 0.24921932816505432, + "learning_rate": 9.68943246478842e-06, + "loss": 0.0259, + "step": 16850 + }, + { + "epoch": 1.0102462699982024, + "grad_norm": 0.7376067042350769, + "learning_rate": 9.67978739013242e-06, + "loss": 0.0238, + "step": 16860 + }, + { + "epoch": 1.0108454670741207, + "grad_norm": 0.35099226236343384, + "learning_rate": 9.670143864819452e-06, + "loss": 0.0257, + "step": 16870 + }, + { + "epoch": 1.011444664150039, + "grad_norm": 0.3805389702320099, + "learning_rate": 9.660501900166734e-06, + "loss": 0.0198, + "step": 16880 + }, + { + "epoch": 1.0120438612259572, + "grad_norm": 0.4433703124523163, + "learning_rate": 9.650861507489642e-06, + "loss": 0.0241, + "step": 16890 + }, + { + "epoch": 1.0126430583018755, + "grad_norm": 0.3667793571949005, + "learning_rate": 9.641222698101725e-06, + "loss": 0.0268, + "step": 16900 + }, + { + "epoch": 1.0132422553777938, + "grad_norm": 0.2963331639766693, + "learning_rate": 9.63158548331465e-06, + "loss": 0.0223, + "step": 16910 + }, + { + "epoch": 1.013841452453712, + "grad_norm": 0.9817414879798889, + "learning_rate": 9.621949874438232e-06, + "loss": 0.0248, + "step": 16920 + }, + { + "epoch": 1.0144406495296303, + "grad_norm": 0.6529688835144043, + "learning_rate": 9.612315882780393e-06, + "loss": 0.032, + "step": 16930 + }, + { + "epoch": 1.0150398466055486, + "grad_norm": 0.7663154602050781, + "learning_rate": 9.602683519647158e-06, + "loss": 0.0267, + "step": 16940 + }, + { + "epoch": 1.0156390436814668, + "grad_norm": 0.6086964011192322, + "learning_rate": 9.593052796342643e-06, + "loss": 0.0281, + "step": 16950 + }, + { + "epoch": 1.0162382407573851, + "grad_norm": 0.5240464806556702, + "learning_rate": 9.58342372416904e-06, + "loss": 0.0339, + "step": 16960 + }, + { + "epoch": 1.0168374378333034, + "grad_norm": 0.6558368802070618, + "learning_rate": 9.5737963144266e-06, + "loss": 0.0284, + "step": 16970 + }, + { + "epoch": 1.0174366349092216, + "grad_norm": 0.6192268133163452, + "learning_rate": 9.564170578413623e-06, + "loss": 0.0309, + "step": 16980 + }, + { + "epoch": 1.01803583198514, + "grad_norm": 0.5293763875961304, + "learning_rate": 9.554546527426454e-06, + "loss": 0.0257, + "step": 16990 + }, + { + "epoch": 1.0186350290610582, + "grad_norm": 0.38831329345703125, + "learning_rate": 9.54492417275944e-06, + "loss": 0.0239, + "step": 17000 + }, + { + "epoch": 1.0192342261369765, + "grad_norm": 1.12827467918396, + "learning_rate": 9.535303525704958e-06, + "loss": 0.0323, + "step": 17010 + }, + { + "epoch": 1.0198334232128947, + "grad_norm": 0.411818265914917, + "learning_rate": 9.525684597553371e-06, + "loss": 0.0274, + "step": 17020 + }, + { + "epoch": 1.020432620288813, + "grad_norm": 0.5521355867385864, + "learning_rate": 9.51606739959303e-06, + "loss": 0.0233, + "step": 17030 + }, + { + "epoch": 1.0210318173647313, + "grad_norm": 0.26673075556755066, + "learning_rate": 9.506451943110247e-06, + "loss": 0.0317, + "step": 17040 + }, + { + "epoch": 1.0216310144406495, + "grad_norm": 0.5205486416816711, + "learning_rate": 9.496838239389303e-06, + "loss": 0.0273, + "step": 17050 + }, + { + "epoch": 1.0222302115165678, + "grad_norm": 0.8010990619659424, + "learning_rate": 9.487226299712409e-06, + "loss": 0.0292, + "step": 17060 + }, + { + "epoch": 1.022829408592486, + "grad_norm": 0.420612633228302, + "learning_rate": 9.477616135359713e-06, + "loss": 0.0274, + "step": 17070 + }, + { + "epoch": 1.0234286056684043, + "grad_norm": 0.4811270236968994, + "learning_rate": 9.46800775760928e-06, + "loss": 0.0277, + "step": 17080 + }, + { + "epoch": 1.0240278027443226, + "grad_norm": 0.4959382712841034, + "learning_rate": 9.458401177737077e-06, + "loss": 0.0288, + "step": 17090 + }, + { + "epoch": 1.0246269998202409, + "grad_norm": 0.4607725739479065, + "learning_rate": 9.448796407016959e-06, + "loss": 0.0245, + "step": 17100 + }, + { + "epoch": 1.0252261968961591, + "grad_norm": 0.9101414680480957, + "learning_rate": 9.439193456720655e-06, + "loss": 0.0283, + "step": 17110 + }, + { + "epoch": 1.0258253939720774, + "grad_norm": 0.38626620173454285, + "learning_rate": 9.42959233811777e-06, + "loss": 0.026, + "step": 17120 + }, + { + "epoch": 1.0264245910479957, + "grad_norm": 0.5709372758865356, + "learning_rate": 9.419993062475743e-06, + "loss": 0.021, + "step": 17130 + }, + { + "epoch": 1.027023788123914, + "grad_norm": 0.4417913854122162, + "learning_rate": 9.410395641059868e-06, + "loss": 0.0291, + "step": 17140 + }, + { + "epoch": 1.0276229851998322, + "grad_norm": 0.5651213526725769, + "learning_rate": 9.400800085133245e-06, + "loss": 0.0228, + "step": 17150 + }, + { + "epoch": 1.0282221822757505, + "grad_norm": 0.4716165363788605, + "learning_rate": 9.391206405956804e-06, + "loss": 0.0242, + "step": 17160 + }, + { + "epoch": 1.0288213793516687, + "grad_norm": 0.9120892286300659, + "learning_rate": 9.381614614789247e-06, + "loss": 0.0296, + "step": 17170 + }, + { + "epoch": 1.029420576427587, + "grad_norm": 0.5004292130470276, + "learning_rate": 9.372024722887089e-06, + "loss": 0.033, + "step": 17180 + }, + { + "epoch": 1.0300197735035053, + "grad_norm": 0.3422714173793793, + "learning_rate": 9.362436741504598e-06, + "loss": 0.0284, + "step": 17190 + }, + { + "epoch": 1.0306189705794235, + "grad_norm": 0.5391610264778137, + "learning_rate": 9.352850681893798e-06, + "loss": 0.0362, + "step": 17200 + }, + { + "epoch": 1.0312181676553418, + "grad_norm": 0.5446203351020813, + "learning_rate": 9.343266555304471e-06, + "loss": 0.0247, + "step": 17210 + }, + { + "epoch": 1.03181736473126, + "grad_norm": 0.5441875457763672, + "learning_rate": 9.333684372984119e-06, + "loss": 0.0284, + "step": 17220 + }, + { + "epoch": 1.0324165618071783, + "grad_norm": 0.48274070024490356, + "learning_rate": 9.324104146177972e-06, + "loss": 0.0245, + "step": 17230 + }, + { + "epoch": 1.0330157588830966, + "grad_norm": 0.6035326719284058, + "learning_rate": 9.314525886128956e-06, + "loss": 0.0226, + "step": 17240 + }, + { + "epoch": 1.0336149559590149, + "grad_norm": 0.3104001581668854, + "learning_rate": 9.304949604077693e-06, + "loss": 0.029, + "step": 17250 + }, + { + "epoch": 1.0342141530349331, + "grad_norm": 0.27859869599342346, + "learning_rate": 9.295375311262483e-06, + "loss": 0.022, + "step": 17260 + }, + { + "epoch": 1.0348133501108514, + "grad_norm": 0.3896406292915344, + "learning_rate": 9.285803018919292e-06, + "loss": 0.0235, + "step": 17270 + }, + { + "epoch": 1.0354125471867697, + "grad_norm": 0.4526473581790924, + "learning_rate": 9.276232738281744e-06, + "loss": 0.0289, + "step": 17280 + }, + { + "epoch": 1.036011744262688, + "grad_norm": 0.6624506115913391, + "learning_rate": 9.266664480581085e-06, + "loss": 0.0265, + "step": 17290 + }, + { + "epoch": 1.0366109413386062, + "grad_norm": 0.6976125836372375, + "learning_rate": 9.257098257046206e-06, + "loss": 0.029, + "step": 17300 + }, + { + "epoch": 1.0372101384145245, + "grad_norm": 0.5974310040473938, + "learning_rate": 9.247534078903601e-06, + "loss": 0.0205, + "step": 17310 + }, + { + "epoch": 1.0378093354904427, + "grad_norm": 0.7627739906311035, + "learning_rate": 9.23797195737737e-06, + "loss": 0.0333, + "step": 17320 + }, + { + "epoch": 1.038408532566361, + "grad_norm": 0.3166525065898895, + "learning_rate": 9.228411903689187e-06, + "loss": 0.0309, + "step": 17330 + }, + { + "epoch": 1.0390077296422793, + "grad_norm": 0.41519322991371155, + "learning_rate": 9.218853929058316e-06, + "loss": 0.0223, + "step": 17340 + }, + { + "epoch": 1.0396069267181975, + "grad_norm": 0.31840237975120544, + "learning_rate": 9.209298044701567e-06, + "loss": 0.0239, + "step": 17350 + }, + { + "epoch": 1.0402061237941158, + "grad_norm": 0.47412827610969543, + "learning_rate": 9.199744261833305e-06, + "loss": 0.0228, + "step": 17360 + }, + { + "epoch": 1.040805320870034, + "grad_norm": 0.41170552372932434, + "learning_rate": 9.19019259166543e-06, + "loss": 0.0209, + "step": 17370 + }, + { + "epoch": 1.0414045179459523, + "grad_norm": 0.45858854055404663, + "learning_rate": 9.18064304540735e-06, + "loss": 0.0243, + "step": 17380 + }, + { + "epoch": 1.0420037150218706, + "grad_norm": 0.7870534658432007, + "learning_rate": 9.171095634265995e-06, + "loss": 0.027, + "step": 17390 + }, + { + "epoch": 1.0426029120977889, + "grad_norm": 0.4080354869365692, + "learning_rate": 9.161550369445782e-06, + "loss": 0.023, + "step": 17400 + }, + { + "epoch": 1.0432021091737071, + "grad_norm": 0.47916823625564575, + "learning_rate": 9.152007262148612e-06, + "loss": 0.0303, + "step": 17410 + }, + { + "epoch": 1.0438013062496254, + "grad_norm": 0.6911760568618774, + "learning_rate": 9.142466323573853e-06, + "loss": 0.0263, + "step": 17420 + }, + { + "epoch": 1.0444005033255437, + "grad_norm": 0.3980148732662201, + "learning_rate": 9.132927564918328e-06, + "loss": 0.028, + "step": 17430 + }, + { + "epoch": 1.044999700401462, + "grad_norm": 0.47085851430892944, + "learning_rate": 9.1233909973763e-06, + "loss": 0.0266, + "step": 17440 + }, + { + "epoch": 1.0455988974773802, + "grad_norm": 0.5085862874984741, + "learning_rate": 9.113856632139466e-06, + "loss": 0.0239, + "step": 17450 + }, + { + "epoch": 1.0461980945532985, + "grad_norm": 0.5219245553016663, + "learning_rate": 9.104324480396934e-06, + "loss": 0.0267, + "step": 17460 + }, + { + "epoch": 1.0467972916292168, + "grad_norm": 0.5199264287948608, + "learning_rate": 9.09479455333521e-06, + "loss": 0.0277, + "step": 17470 + }, + { + "epoch": 1.047396488705135, + "grad_norm": 0.6157195568084717, + "learning_rate": 9.085266862138197e-06, + "loss": 0.0343, + "step": 17480 + }, + { + "epoch": 1.0479956857810533, + "grad_norm": 0.5366696715354919, + "learning_rate": 9.07574141798717e-06, + "loss": 0.0271, + "step": 17490 + }, + { + "epoch": 1.0485948828569716, + "grad_norm": 0.3640076220035553, + "learning_rate": 9.066218232060774e-06, + "loss": 0.0258, + "step": 17500 + }, + { + "epoch": 1.0491940799328898, + "grad_norm": 0.5320505499839783, + "learning_rate": 9.05669731553499e-06, + "loss": 0.024, + "step": 17510 + }, + { + "epoch": 1.049793277008808, + "grad_norm": 0.507826566696167, + "learning_rate": 9.047178679583151e-06, + "loss": 0.0253, + "step": 17520 + }, + { + "epoch": 1.0503924740847266, + "grad_norm": 0.741392195224762, + "learning_rate": 9.037662335375902e-06, + "loss": 0.0242, + "step": 17530 + }, + { + "epoch": 1.0509916711606446, + "grad_norm": 0.5325136184692383, + "learning_rate": 9.028148294081203e-06, + "loss": 0.0224, + "step": 17540 + }, + { + "epoch": 1.0515908682365631, + "grad_norm": 0.4709665775299072, + "learning_rate": 9.018636566864313e-06, + "loss": 0.026, + "step": 17550 + }, + { + "epoch": 1.0521900653124814, + "grad_norm": 0.4371986985206604, + "learning_rate": 9.00912716488778e-06, + "loss": 0.0264, + "step": 17560 + }, + { + "epoch": 1.0527892623883996, + "grad_norm": 0.47594818472862244, + "learning_rate": 8.999620099311405e-06, + "loss": 0.0224, + "step": 17570 + }, + { + "epoch": 1.053388459464318, + "grad_norm": 0.488423228263855, + "learning_rate": 8.990115381292264e-06, + "loss": 0.0261, + "step": 17580 + }, + { + "epoch": 1.0539876565402362, + "grad_norm": 0.24745763838291168, + "learning_rate": 8.980613021984675e-06, + "loss": 0.0206, + "step": 17590 + }, + { + "epoch": 1.0545868536161545, + "grad_norm": 0.5042629837989807, + "learning_rate": 8.97111303254018e-06, + "loss": 0.0305, + "step": 17600 + }, + { + "epoch": 1.0551860506920727, + "grad_norm": 0.5255836844444275, + "learning_rate": 8.961615424107555e-06, + "loss": 0.026, + "step": 17610 + }, + { + "epoch": 1.055785247767991, + "grad_norm": 0.4605107307434082, + "learning_rate": 8.952120207832764e-06, + "loss": 0.0274, + "step": 17620 + }, + { + "epoch": 1.0563844448439093, + "grad_norm": 0.3252561390399933, + "learning_rate": 8.942627394858978e-06, + "loss": 0.0227, + "step": 17630 + }, + { + "epoch": 1.0569836419198275, + "grad_norm": 0.35779184103012085, + "learning_rate": 8.933136996326539e-06, + "loss": 0.0296, + "step": 17640 + }, + { + "epoch": 1.0575828389957458, + "grad_norm": 0.2960403263568878, + "learning_rate": 8.923649023372962e-06, + "loss": 0.0212, + "step": 17650 + }, + { + "epoch": 1.058182036071664, + "grad_norm": 0.6344659328460693, + "learning_rate": 8.914163487132906e-06, + "loss": 0.026, + "step": 17660 + }, + { + "epoch": 1.0587812331475823, + "grad_norm": 0.4614463150501251, + "learning_rate": 8.904680398738176e-06, + "loss": 0.0234, + "step": 17670 + }, + { + "epoch": 1.0593804302235006, + "grad_norm": 0.4490053951740265, + "learning_rate": 8.895199769317711e-06, + "loss": 0.0265, + "step": 17680 + }, + { + "epoch": 1.0599796272994189, + "grad_norm": 0.5291271209716797, + "learning_rate": 8.885721609997551e-06, + "loss": 0.0326, + "step": 17690 + }, + { + "epoch": 1.0605788243753371, + "grad_norm": 0.5311887264251709, + "learning_rate": 8.876245931900847e-06, + "loss": 0.0257, + "step": 17700 + }, + { + "epoch": 1.0611780214512554, + "grad_norm": 0.5647584199905396, + "learning_rate": 8.866772746147833e-06, + "loss": 0.0295, + "step": 17710 + }, + { + "epoch": 1.0617772185271737, + "grad_norm": 0.3913862705230713, + "learning_rate": 8.857302063855826e-06, + "loss": 0.0256, + "step": 17720 + }, + { + "epoch": 1.062376415603092, + "grad_norm": 0.4476219415664673, + "learning_rate": 8.847833896139193e-06, + "loss": 0.0248, + "step": 17730 + }, + { + "epoch": 1.0629756126790102, + "grad_norm": 0.7807655930519104, + "learning_rate": 8.83836825410936e-06, + "loss": 0.026, + "step": 17740 + }, + { + "epoch": 1.0635748097549285, + "grad_norm": 0.38984328508377075, + "learning_rate": 8.828905148874785e-06, + "loss": 0.0247, + "step": 17750 + }, + { + "epoch": 1.0641740068308467, + "grad_norm": 0.5757346153259277, + "learning_rate": 8.819444591540942e-06, + "loss": 0.0296, + "step": 17760 + }, + { + "epoch": 1.064773203906765, + "grad_norm": 0.25636178255081177, + "learning_rate": 8.809986593210331e-06, + "loss": 0.0222, + "step": 17770 + }, + { + "epoch": 1.0653724009826833, + "grad_norm": 0.45617344975471497, + "learning_rate": 8.800531164982436e-06, + "loss": 0.0224, + "step": 17780 + }, + { + "epoch": 1.0659715980586015, + "grad_norm": 0.3066493272781372, + "learning_rate": 8.791078317953728e-06, + "loss": 0.0237, + "step": 17790 + }, + { + "epoch": 1.0665707951345198, + "grad_norm": 0.26513972878456116, + "learning_rate": 8.78162806321765e-06, + "loss": 0.0277, + "step": 17800 + }, + { + "epoch": 1.067169992210438, + "grad_norm": 0.445230633020401, + "learning_rate": 8.772180411864604e-06, + "loss": 0.0248, + "step": 17810 + }, + { + "epoch": 1.0677691892863563, + "grad_norm": 0.4914413392543793, + "learning_rate": 8.762735374981932e-06, + "loss": 0.022, + "step": 17820 + }, + { + "epoch": 1.0683683863622746, + "grad_norm": 0.41469570994377136, + "learning_rate": 8.753292963653915e-06, + "loss": 0.0245, + "step": 17830 + }, + { + "epoch": 1.0689675834381929, + "grad_norm": 0.33235347270965576, + "learning_rate": 8.743853188961749e-06, + "loss": 0.0229, + "step": 17840 + }, + { + "epoch": 1.0695667805141111, + "grad_norm": 0.4890037775039673, + "learning_rate": 8.734416061983528e-06, + "loss": 0.0247, + "step": 17850 + }, + { + "epoch": 1.0701659775900294, + "grad_norm": 0.41330578923225403, + "learning_rate": 8.724981593794253e-06, + "loss": 0.0285, + "step": 17860 + }, + { + "epoch": 1.0707651746659477, + "grad_norm": 0.6309427618980408, + "learning_rate": 8.715549795465797e-06, + "loss": 0.0233, + "step": 17870 + }, + { + "epoch": 1.071364371741866, + "grad_norm": 0.42090296745300293, + "learning_rate": 8.706120678066895e-06, + "loss": 0.0254, + "step": 17880 + }, + { + "epoch": 1.0719635688177842, + "grad_norm": 0.5888519287109375, + "learning_rate": 8.69669425266315e-06, + "loss": 0.0262, + "step": 17890 + }, + { + "epoch": 1.0725627658937025, + "grad_norm": 0.5488774180412292, + "learning_rate": 8.687270530316993e-06, + "loss": 0.0262, + "step": 17900 + }, + { + "epoch": 1.0731619629696207, + "grad_norm": 0.48015111684799194, + "learning_rate": 8.677849522087687e-06, + "loss": 0.0219, + "step": 17910 + }, + { + "epoch": 1.073761160045539, + "grad_norm": 0.4484168291091919, + "learning_rate": 8.66843123903131e-06, + "loss": 0.0276, + "step": 17920 + }, + { + "epoch": 1.0743603571214573, + "grad_norm": 0.4128018319606781, + "learning_rate": 8.659015692200741e-06, + "loss": 0.0218, + "step": 17930 + }, + { + "epoch": 1.0749595541973755, + "grad_norm": 0.5151517987251282, + "learning_rate": 8.649602892645654e-06, + "loss": 0.0242, + "step": 17940 + }, + { + "epoch": 1.0755587512732938, + "grad_norm": 0.6248350143432617, + "learning_rate": 8.640192851412488e-06, + "loss": 0.0267, + "step": 17950 + }, + { + "epoch": 1.076157948349212, + "grad_norm": 0.4116908013820648, + "learning_rate": 8.630785579544448e-06, + "loss": 0.0242, + "step": 17960 + }, + { + "epoch": 1.0767571454251303, + "grad_norm": 0.6138579249382019, + "learning_rate": 8.621381088081499e-06, + "loss": 0.0282, + "step": 17970 + }, + { + "epoch": 1.0773563425010486, + "grad_norm": 0.22843605279922485, + "learning_rate": 8.611979388060327e-06, + "loss": 0.0284, + "step": 17980 + }, + { + "epoch": 1.0779555395769669, + "grad_norm": 0.49555841088294983, + "learning_rate": 8.60258049051436e-06, + "loss": 0.0244, + "step": 17990 + }, + { + "epoch": 1.0785547366528851, + "grad_norm": 0.5752411484718323, + "learning_rate": 8.593184406473722e-06, + "loss": 0.0275, + "step": 18000 + }, + { + "epoch": 1.0791539337288034, + "grad_norm": 0.5129706859588623, + "learning_rate": 8.583791146965244e-06, + "loss": 0.0237, + "step": 18010 + }, + { + "epoch": 1.0797531308047217, + "grad_norm": 0.751230001449585, + "learning_rate": 8.574400723012433e-06, + "loss": 0.0257, + "step": 18020 + }, + { + "epoch": 1.08035232788064, + "grad_norm": 0.47749435901641846, + "learning_rate": 8.565013145635486e-06, + "loss": 0.0277, + "step": 18030 + }, + { + "epoch": 1.0809515249565582, + "grad_norm": 0.21702095866203308, + "learning_rate": 8.555628425851237e-06, + "loss": 0.0255, + "step": 18040 + }, + { + "epoch": 1.0815507220324765, + "grad_norm": 0.30658838152885437, + "learning_rate": 8.54624657467318e-06, + "loss": 0.024, + "step": 18050 + }, + { + "epoch": 1.0821499191083948, + "grad_norm": 0.3589625954627991, + "learning_rate": 8.536867603111446e-06, + "loss": 0.0215, + "step": 18060 + }, + { + "epoch": 1.082749116184313, + "grad_norm": 0.5434426069259644, + "learning_rate": 8.52749152217277e-06, + "loss": 0.0224, + "step": 18070 + }, + { + "epoch": 1.0833483132602313, + "grad_norm": 0.8732438683509827, + "learning_rate": 8.518118342860516e-06, + "loss": 0.0289, + "step": 18080 + }, + { + "epoch": 1.0839475103361496, + "grad_norm": 0.34988290071487427, + "learning_rate": 8.508748076174625e-06, + "loss": 0.0226, + "step": 18090 + }, + { + "epoch": 1.0845467074120678, + "grad_norm": 0.4021032154560089, + "learning_rate": 8.499380733111628e-06, + "loss": 0.0248, + "step": 18100 + }, + { + "epoch": 1.085145904487986, + "grad_norm": 0.4676196873188019, + "learning_rate": 8.490016324664626e-06, + "loss": 0.0235, + "step": 18110 + }, + { + "epoch": 1.0857451015639044, + "grad_norm": 0.41646474599838257, + "learning_rate": 8.480654861823275e-06, + "loss": 0.0235, + "step": 18120 + }, + { + "epoch": 1.0863442986398226, + "grad_norm": 0.5892519950866699, + "learning_rate": 8.471296355573768e-06, + "loss": 0.0221, + "step": 18130 + }, + { + "epoch": 1.086943495715741, + "grad_norm": 0.5757095217704773, + "learning_rate": 8.461940816898838e-06, + "loss": 0.0258, + "step": 18140 + }, + { + "epoch": 1.0875426927916592, + "grad_norm": 0.4664652645587921, + "learning_rate": 8.452588256777728e-06, + "loss": 0.0275, + "step": 18150 + }, + { + "epoch": 1.0881418898675774, + "grad_norm": 0.4674879014492035, + "learning_rate": 8.443238686186186e-06, + "loss": 0.0285, + "step": 18160 + }, + { + "epoch": 1.0887410869434957, + "grad_norm": 0.7277936339378357, + "learning_rate": 8.433892116096462e-06, + "loss": 0.0316, + "step": 18170 + }, + { + "epoch": 1.089340284019414, + "grad_norm": 0.40373867750167847, + "learning_rate": 8.424548557477269e-06, + "loss": 0.0213, + "step": 18180 + }, + { + "epoch": 1.0899394810953322, + "grad_norm": 0.8632686138153076, + "learning_rate": 8.415208021293797e-06, + "loss": 0.0239, + "step": 18190 + }, + { + "epoch": 1.0905386781712505, + "grad_norm": 0.5620945692062378, + "learning_rate": 8.405870518507681e-06, + "loss": 0.0259, + "step": 18200 + }, + { + "epoch": 1.0911378752471688, + "grad_norm": 0.3430384695529938, + "learning_rate": 8.39653606007701e-06, + "loss": 0.0287, + "step": 18210 + }, + { + "epoch": 1.091737072323087, + "grad_norm": 0.46981969475746155, + "learning_rate": 8.387204656956285e-06, + "loss": 0.0218, + "step": 18220 + }, + { + "epoch": 1.0923362693990053, + "grad_norm": 0.3494231700897217, + "learning_rate": 8.377876320096423e-06, + "loss": 0.0238, + "step": 18230 + }, + { + "epoch": 1.0929354664749236, + "grad_norm": 0.514975368976593, + "learning_rate": 8.368551060444755e-06, + "loss": 0.0205, + "step": 18240 + }, + { + "epoch": 1.0935346635508418, + "grad_norm": 0.6442168951034546, + "learning_rate": 8.359228888944986e-06, + "loss": 0.021, + "step": 18250 + }, + { + "epoch": 1.09413386062676, + "grad_norm": 0.32178881764411926, + "learning_rate": 8.349909816537207e-06, + "loss": 0.0219, + "step": 18260 + }, + { + "epoch": 1.0947330577026784, + "grad_norm": 0.48865941166877747, + "learning_rate": 8.340593854157868e-06, + "loss": 0.0261, + "step": 18270 + }, + { + "epoch": 1.0953322547785966, + "grad_norm": 0.6131434440612793, + "learning_rate": 8.331281012739771e-06, + "loss": 0.0269, + "step": 18280 + }, + { + "epoch": 1.095931451854515, + "grad_norm": 0.4471806585788727, + "learning_rate": 8.32197130321205e-06, + "loss": 0.0251, + "step": 18290 + }, + { + "epoch": 1.0965306489304332, + "grad_norm": 0.8255780935287476, + "learning_rate": 8.312664736500176e-06, + "loss": 0.0229, + "step": 18300 + }, + { + "epoch": 1.0971298460063514, + "grad_norm": 0.843673586845398, + "learning_rate": 8.303361323525916e-06, + "loss": 0.0278, + "step": 18310 + }, + { + "epoch": 1.0977290430822697, + "grad_norm": 0.4278610348701477, + "learning_rate": 8.294061075207343e-06, + "loss": 0.0228, + "step": 18320 + }, + { + "epoch": 1.098328240158188, + "grad_norm": 0.5036011338233948, + "learning_rate": 8.28476400245882e-06, + "loss": 0.0291, + "step": 18330 + }, + { + "epoch": 1.0989274372341062, + "grad_norm": 0.5141382813453674, + "learning_rate": 8.275470116190976e-06, + "loss": 0.0217, + "step": 18340 + }, + { + "epoch": 1.0995266343100245, + "grad_norm": 0.8976346850395203, + "learning_rate": 8.26617942731071e-06, + "loss": 0.0248, + "step": 18350 + }, + { + "epoch": 1.1001258313859428, + "grad_norm": 0.5634751319885254, + "learning_rate": 8.256891946721157e-06, + "loss": 0.0276, + "step": 18360 + }, + { + "epoch": 1.100725028461861, + "grad_norm": 0.5327013731002808, + "learning_rate": 8.247607685321697e-06, + "loss": 0.0279, + "step": 18370 + }, + { + "epoch": 1.1013242255377793, + "grad_norm": 0.2723959982395172, + "learning_rate": 8.238326654007925e-06, + "loss": 0.0225, + "step": 18380 + }, + { + "epoch": 1.1019234226136976, + "grad_norm": 0.4455258846282959, + "learning_rate": 8.229048863671649e-06, + "loss": 0.0222, + "step": 18390 + }, + { + "epoch": 1.1025226196896158, + "grad_norm": 0.3784103989601135, + "learning_rate": 8.219774325200873e-06, + "loss": 0.024, + "step": 18400 + }, + { + "epoch": 1.1031218167655341, + "grad_norm": 0.8102694749832153, + "learning_rate": 8.210503049479787e-06, + "loss": 0.0231, + "step": 18410 + }, + { + "epoch": 1.1037210138414524, + "grad_norm": 0.5179240703582764, + "learning_rate": 8.201235047388747e-06, + "loss": 0.0255, + "step": 18420 + }, + { + "epoch": 1.1043202109173706, + "grad_norm": 0.39830490946769714, + "learning_rate": 8.191970329804269e-06, + "loss": 0.0264, + "step": 18430 + }, + { + "epoch": 1.104919407993289, + "grad_norm": 0.32860279083251953, + "learning_rate": 8.182708907599012e-06, + "loss": 0.0241, + "step": 18440 + }, + { + "epoch": 1.1055186050692072, + "grad_norm": 0.5459582209587097, + "learning_rate": 8.173450791641779e-06, + "loss": 0.0193, + "step": 18450 + }, + { + "epoch": 1.1061178021451255, + "grad_norm": 0.3841477036476135, + "learning_rate": 8.164195992797482e-06, + "loss": 0.0282, + "step": 18460 + }, + { + "epoch": 1.1067169992210437, + "grad_norm": 0.7849119305610657, + "learning_rate": 8.154944521927136e-06, + "loss": 0.0319, + "step": 18470 + }, + { + "epoch": 1.107316196296962, + "grad_norm": 0.4457703232765198, + "learning_rate": 8.145696389887864e-06, + "loss": 0.0279, + "step": 18480 + }, + { + "epoch": 1.1079153933728803, + "grad_norm": 0.30464428663253784, + "learning_rate": 8.136451607532865e-06, + "loss": 0.0184, + "step": 18490 + }, + { + "epoch": 1.1085145904487985, + "grad_norm": 1.0635287761688232, + "learning_rate": 8.127210185711404e-06, + "loss": 0.0265, + "step": 18500 + }, + { + "epoch": 1.1091137875247168, + "grad_norm": 0.33294421434402466, + "learning_rate": 8.117972135268806e-06, + "loss": 0.0235, + "step": 18510 + }, + { + "epoch": 1.109712984600635, + "grad_norm": 0.5644985437393188, + "learning_rate": 8.10873746704643e-06, + "loss": 0.0218, + "step": 18520 + }, + { + "epoch": 1.1103121816765533, + "grad_norm": 0.4975566565990448, + "learning_rate": 8.099506191881683e-06, + "loss": 0.0261, + "step": 18530 + }, + { + "epoch": 1.1109113787524716, + "grad_norm": 0.7503839135169983, + "learning_rate": 8.090278320607975e-06, + "loss": 0.0218, + "step": 18540 + }, + { + "epoch": 1.1115105758283899, + "grad_norm": 0.35363277792930603, + "learning_rate": 8.081053864054731e-06, + "loss": 0.0198, + "step": 18550 + }, + { + "epoch": 1.1121097729043081, + "grad_norm": 0.43968406319618225, + "learning_rate": 8.07183283304736e-06, + "loss": 0.0253, + "step": 18560 + }, + { + "epoch": 1.1127089699802264, + "grad_norm": 0.4553394615650177, + "learning_rate": 8.062615238407261e-06, + "loss": 0.0266, + "step": 18570 + }, + { + "epoch": 1.1133081670561449, + "grad_norm": 0.45489153265953064, + "learning_rate": 8.053401090951787e-06, + "loss": 0.0264, + "step": 18580 + }, + { + "epoch": 1.113907364132063, + "grad_norm": 0.424696147441864, + "learning_rate": 8.044190401494265e-06, + "loss": 0.0209, + "step": 18590 + }, + { + "epoch": 1.1145065612079814, + "grad_norm": 0.4819740653038025, + "learning_rate": 8.03498318084394e-06, + "loss": 0.022, + "step": 18600 + }, + { + "epoch": 1.1151057582838995, + "grad_norm": 0.564834475517273, + "learning_rate": 8.025779439806006e-06, + "loss": 0.024, + "step": 18610 + }, + { + "epoch": 1.115704955359818, + "grad_norm": 0.7905157804489136, + "learning_rate": 8.01657918918156e-06, + "loss": 0.0261, + "step": 18620 + }, + { + "epoch": 1.116304152435736, + "grad_norm": 0.6985124349594116, + "learning_rate": 8.007382439767612e-06, + "loss": 0.0315, + "step": 18630 + }, + { + "epoch": 1.1169033495116545, + "grad_norm": 0.42378291487693787, + "learning_rate": 7.998189202357063e-06, + "loss": 0.0237, + "step": 18640 + }, + { + "epoch": 1.1175025465875728, + "grad_norm": 0.5980759263038635, + "learning_rate": 7.98899948773868e-06, + "loss": 0.0217, + "step": 18650 + }, + { + "epoch": 1.118101743663491, + "grad_norm": 0.45916232466697693, + "learning_rate": 7.979813306697113e-06, + "loss": 0.0235, + "step": 18660 + }, + { + "epoch": 1.1187009407394093, + "grad_norm": 0.25486481189727783, + "learning_rate": 7.970630670012853e-06, + "loss": 0.0231, + "step": 18670 + }, + { + "epoch": 1.1193001378153276, + "grad_norm": 0.4072360694408417, + "learning_rate": 7.961451588462241e-06, + "loss": 0.0261, + "step": 18680 + }, + { + "epoch": 1.1198993348912458, + "grad_norm": 0.3813820481300354, + "learning_rate": 7.952276072817438e-06, + "loss": 0.0209, + "step": 18690 + }, + { + "epoch": 1.120498531967164, + "grad_norm": 0.3040210008621216, + "learning_rate": 7.94310413384642e-06, + "loss": 0.0225, + "step": 18700 + }, + { + "epoch": 1.1210977290430824, + "grad_norm": 0.30910906195640564, + "learning_rate": 7.933935782312965e-06, + "loss": 0.026, + "step": 18710 + }, + { + "epoch": 1.1216969261190006, + "grad_norm": 0.6573566794395447, + "learning_rate": 7.924771028976653e-06, + "loss": 0.0262, + "step": 18720 + }, + { + "epoch": 1.122296123194919, + "grad_norm": 0.30632153153419495, + "learning_rate": 7.915609884592828e-06, + "loss": 0.0251, + "step": 18730 + }, + { + "epoch": 1.1228953202708372, + "grad_norm": 0.3277539610862732, + "learning_rate": 7.9064523599126e-06, + "loss": 0.0233, + "step": 18740 + }, + { + "epoch": 1.1234945173467554, + "grad_norm": 0.49589917063713074, + "learning_rate": 7.897298465682839e-06, + "loss": 0.0211, + "step": 18750 + }, + { + "epoch": 1.1240937144226737, + "grad_norm": 0.4149130880832672, + "learning_rate": 7.888148212646149e-06, + "loss": 0.0203, + "step": 18760 + }, + { + "epoch": 1.124692911498592, + "grad_norm": 0.7051926851272583, + "learning_rate": 7.879001611540864e-06, + "loss": 0.0272, + "step": 18770 + }, + { + "epoch": 1.1252921085745102, + "grad_norm": 0.8553881049156189, + "learning_rate": 7.869858673101027e-06, + "loss": 0.0236, + "step": 18780 + }, + { + "epoch": 1.1258913056504285, + "grad_norm": 0.5676615238189697, + "learning_rate": 7.860719408056385e-06, + "loss": 0.0242, + "step": 18790 + }, + { + "epoch": 1.1264905027263468, + "grad_norm": 0.29548707604408264, + "learning_rate": 7.851583827132372e-06, + "loss": 0.0236, + "step": 18800 + }, + { + "epoch": 1.127089699802265, + "grad_norm": 0.36076608300209045, + "learning_rate": 7.842451941050107e-06, + "loss": 0.0219, + "step": 18810 + }, + { + "epoch": 1.1276888968781833, + "grad_norm": 0.3657922148704529, + "learning_rate": 7.833323760526363e-06, + "loss": 0.0227, + "step": 18820 + }, + { + "epoch": 1.1282880939541016, + "grad_norm": 0.27593615651130676, + "learning_rate": 7.82419929627357e-06, + "loss": 0.0251, + "step": 18830 + }, + { + "epoch": 1.1288872910300198, + "grad_norm": 0.35554730892181396, + "learning_rate": 7.815078558999794e-06, + "loss": 0.0259, + "step": 18840 + }, + { + "epoch": 1.129486488105938, + "grad_norm": 0.45652297139167786, + "learning_rate": 7.80596155940873e-06, + "loss": 0.0274, + "step": 18850 + }, + { + "epoch": 1.1300856851818564, + "grad_norm": 0.5757999420166016, + "learning_rate": 7.796848308199681e-06, + "loss": 0.0222, + "step": 18860 + }, + { + "epoch": 1.1306848822577746, + "grad_norm": 0.5138059854507446, + "learning_rate": 7.787738816067558e-06, + "loss": 0.0216, + "step": 18870 + }, + { + "epoch": 1.131284079333693, + "grad_norm": 0.338874876499176, + "learning_rate": 7.778633093702863e-06, + "loss": 0.0232, + "step": 18880 + }, + { + "epoch": 1.1318832764096112, + "grad_norm": 0.48215195536613464, + "learning_rate": 7.769531151791654e-06, + "loss": 0.0226, + "step": 18890 + }, + { + "epoch": 1.1324824734855294, + "grad_norm": 0.30239933729171753, + "learning_rate": 7.760433001015579e-06, + "loss": 0.0205, + "step": 18900 + }, + { + "epoch": 1.1330816705614477, + "grad_norm": 0.6099343299865723, + "learning_rate": 7.751338652051818e-06, + "loss": 0.0219, + "step": 18910 + }, + { + "epoch": 1.133680867637366, + "grad_norm": 0.6730902791023254, + "learning_rate": 7.742248115573104e-06, + "loss": 0.0239, + "step": 18920 + }, + { + "epoch": 1.1342800647132842, + "grad_norm": 0.4575020968914032, + "learning_rate": 7.733161402247675e-06, + "loss": 0.0204, + "step": 18930 + }, + { + "epoch": 1.1348792617892025, + "grad_norm": 0.2673267424106598, + "learning_rate": 7.724078522739308e-06, + "loss": 0.0222, + "step": 18940 + }, + { + "epoch": 1.1354784588651208, + "grad_norm": 0.3593531548976898, + "learning_rate": 7.714999487707262e-06, + "loss": 0.0225, + "step": 18950 + }, + { + "epoch": 1.136077655941039, + "grad_norm": 0.5385488867759705, + "learning_rate": 7.705924307806294e-06, + "loss": 0.0248, + "step": 18960 + }, + { + "epoch": 1.1366768530169573, + "grad_norm": 0.3900541663169861, + "learning_rate": 7.69685299368663e-06, + "loss": 0.0277, + "step": 18970 + }, + { + "epoch": 1.1372760500928756, + "grad_norm": 0.6182276010513306, + "learning_rate": 7.687785555993958e-06, + "loss": 0.0241, + "step": 18980 + }, + { + "epoch": 1.1378752471687938, + "grad_norm": 0.4897976815700531, + "learning_rate": 7.67872200536943e-06, + "loss": 0.0229, + "step": 18990 + }, + { + "epoch": 1.1384744442447121, + "grad_norm": 0.5717247128486633, + "learning_rate": 7.669662352449623e-06, + "loss": 0.0273, + "step": 19000 + }, + { + "epoch": 1.1390736413206304, + "grad_norm": 0.4837515950202942, + "learning_rate": 7.660606607866543e-06, + "loss": 0.0219, + "step": 19010 + }, + { + "epoch": 1.1396728383965486, + "grad_norm": 0.31954509019851685, + "learning_rate": 7.651554782247613e-06, + "loss": 0.0271, + "step": 19020 + }, + { + "epoch": 1.140272035472467, + "grad_norm": 0.23005163669586182, + "learning_rate": 7.642506886215654e-06, + "loss": 0.0204, + "step": 19030 + }, + { + "epoch": 1.1408712325483852, + "grad_norm": 0.500217616558075, + "learning_rate": 7.633462930388875e-06, + "loss": 0.0229, + "step": 19040 + }, + { + "epoch": 1.1414704296243035, + "grad_norm": 0.47326523065567017, + "learning_rate": 7.624422925380865e-06, + "loss": 0.0203, + "step": 19050 + }, + { + "epoch": 1.1420696267002217, + "grad_norm": 0.5074726939201355, + "learning_rate": 7.615386881800568e-06, + "loss": 0.0249, + "step": 19060 + }, + { + "epoch": 1.14266882377614, + "grad_norm": 0.6583673357963562, + "learning_rate": 7.606354810252295e-06, + "loss": 0.0243, + "step": 19070 + }, + { + "epoch": 1.1432680208520583, + "grad_norm": 0.7585731744766235, + "learning_rate": 7.5973267213356715e-06, + "loss": 0.0264, + "step": 19080 + }, + { + "epoch": 1.1438672179279765, + "grad_norm": 0.3782348036766052, + "learning_rate": 7.588302625645669e-06, + "loss": 0.0216, + "step": 19090 + }, + { + "epoch": 1.1444664150038948, + "grad_norm": 0.43963512778282166, + "learning_rate": 7.57928253377257e-06, + "loss": 0.0201, + "step": 19100 + }, + { + "epoch": 1.145065612079813, + "grad_norm": 0.6450467109680176, + "learning_rate": 7.5702664563019465e-06, + "loss": 0.0254, + "step": 19110 + }, + { + "epoch": 1.1456648091557313, + "grad_norm": 0.3420482575893402, + "learning_rate": 7.561254403814675e-06, + "loss": 0.0224, + "step": 19120 + }, + { + "epoch": 1.1462640062316496, + "grad_norm": 0.3532888889312744, + "learning_rate": 7.552246386886897e-06, + "loss": 0.0216, + "step": 19130 + }, + { + "epoch": 1.1468632033075679, + "grad_norm": 0.32494598627090454, + "learning_rate": 7.543242416090024e-06, + "loss": 0.0196, + "step": 19140 + }, + { + "epoch": 1.1474624003834861, + "grad_norm": 0.2898419499397278, + "learning_rate": 7.534242501990718e-06, + "loss": 0.0234, + "step": 19150 + }, + { + "epoch": 1.1480615974594044, + "grad_norm": 0.4379838705062866, + "learning_rate": 7.525246655150879e-06, + "loss": 0.0233, + "step": 19160 + }, + { + "epoch": 1.1486607945353227, + "grad_norm": 0.5390518307685852, + "learning_rate": 7.516254886127632e-06, + "loss": 0.0169, + "step": 19170 + }, + { + "epoch": 1.149259991611241, + "grad_norm": 0.3786150813102722, + "learning_rate": 7.507267205473318e-06, + "loss": 0.0203, + "step": 19180 + }, + { + "epoch": 1.1498591886871592, + "grad_norm": 0.3376149833202362, + "learning_rate": 7.498283623735485e-06, + "loss": 0.0266, + "step": 19190 + }, + { + "epoch": 1.1504583857630775, + "grad_norm": 0.40810349583625793, + "learning_rate": 7.489304151456867e-06, + "loss": 0.0241, + "step": 19200 + }, + { + "epoch": 1.1510575828389957, + "grad_norm": 0.24485738575458527, + "learning_rate": 7.480328799175369e-06, + "loss": 0.0199, + "step": 19210 + }, + { + "epoch": 1.151656779914914, + "grad_norm": 0.4670563340187073, + "learning_rate": 7.4713575774240695e-06, + "loss": 0.0184, + "step": 19220 + }, + { + "epoch": 1.1522559769908323, + "grad_norm": 0.374255508184433, + "learning_rate": 7.4623904967312e-06, + "loss": 0.018, + "step": 19230 + }, + { + "epoch": 1.1528551740667505, + "grad_norm": 0.4191536605358124, + "learning_rate": 7.453427567620127e-06, + "loss": 0.022, + "step": 19240 + }, + { + "epoch": 1.1534543711426688, + "grad_norm": 0.3807078003883362, + "learning_rate": 7.444468800609348e-06, + "loss": 0.0232, + "step": 19250 + }, + { + "epoch": 1.154053568218587, + "grad_norm": 0.7537381649017334, + "learning_rate": 7.435514206212475e-06, + "loss": 0.0202, + "step": 19260 + }, + { + "epoch": 1.1546527652945053, + "grad_norm": 0.36507129669189453, + "learning_rate": 7.426563794938218e-06, + "loss": 0.0236, + "step": 19270 + }, + { + "epoch": 1.1552519623704236, + "grad_norm": 0.24461498856544495, + "learning_rate": 7.4176175772903905e-06, + "loss": 0.0221, + "step": 19280 + }, + { + "epoch": 1.1558511594463419, + "grad_norm": 0.351654589176178, + "learning_rate": 7.408675563767873e-06, + "loss": 0.0236, + "step": 19290 + }, + { + "epoch": 1.1564503565222601, + "grad_norm": 0.35627686977386475, + "learning_rate": 7.399737764864619e-06, + "loss": 0.0213, + "step": 19300 + }, + { + "epoch": 1.1570495535981784, + "grad_norm": 0.4586603343486786, + "learning_rate": 7.390804191069631e-06, + "loss": 0.0304, + "step": 19310 + }, + { + "epoch": 1.1576487506740967, + "grad_norm": 0.4082098603248596, + "learning_rate": 7.381874852866957e-06, + "loss": 0.0237, + "step": 19320 + }, + { + "epoch": 1.158247947750015, + "grad_norm": 0.47707459330558777, + "learning_rate": 7.37294976073567e-06, + "loss": 0.0247, + "step": 19330 + }, + { + "epoch": 1.1588471448259332, + "grad_norm": 0.4687316119670868, + "learning_rate": 7.364028925149869e-06, + "loss": 0.0344, + "step": 19340 + }, + { + "epoch": 1.1594463419018515, + "grad_norm": 0.4660017788410187, + "learning_rate": 7.3551123565786485e-06, + "loss": 0.0214, + "step": 19350 + }, + { + "epoch": 1.1600455389777697, + "grad_norm": 0.4644101560115814, + "learning_rate": 7.346200065486093e-06, + "loss": 0.022, + "step": 19360 + }, + { + "epoch": 1.160644736053688, + "grad_norm": 0.3139079213142395, + "learning_rate": 7.337292062331278e-06, + "loss": 0.0234, + "step": 19370 + }, + { + "epoch": 1.1612439331296063, + "grad_norm": 0.36445188522338867, + "learning_rate": 7.328388357568239e-06, + "loss": 0.0262, + "step": 19380 + }, + { + "epoch": 1.1618431302055245, + "grad_norm": 0.6457782983779907, + "learning_rate": 7.319488961645973e-06, + "loss": 0.0261, + "step": 19390 + }, + { + "epoch": 1.1624423272814428, + "grad_norm": 0.4184044599533081, + "learning_rate": 7.310593885008412e-06, + "loss": 0.0245, + "step": 19400 + }, + { + "epoch": 1.163041524357361, + "grad_norm": 0.44356703758239746, + "learning_rate": 7.301703138094429e-06, + "loss": 0.0215, + "step": 19410 + }, + { + "epoch": 1.1636407214332793, + "grad_norm": 0.5394402742385864, + "learning_rate": 7.292816731337807e-06, + "loss": 0.0302, + "step": 19420 + }, + { + "epoch": 1.1642399185091976, + "grad_norm": 0.5960429906845093, + "learning_rate": 7.283934675167239e-06, + "loss": 0.0234, + "step": 19430 + }, + { + "epoch": 1.1648391155851159, + "grad_norm": 0.2850514352321625, + "learning_rate": 7.275056980006318e-06, + "loss": 0.0243, + "step": 19440 + }, + { + "epoch": 1.1654383126610341, + "grad_norm": 0.45071718096733093, + "learning_rate": 7.266183656273509e-06, + "loss": 0.0233, + "step": 19450 + }, + { + "epoch": 1.1660375097369524, + "grad_norm": 0.3157344162464142, + "learning_rate": 7.257314714382151e-06, + "loss": 0.0254, + "step": 19460 + }, + { + "epoch": 1.1666367068128707, + "grad_norm": 0.45518410205841064, + "learning_rate": 7.248450164740439e-06, + "loss": 0.024, + "step": 19470 + }, + { + "epoch": 1.167235903888789, + "grad_norm": 0.2323702722787857, + "learning_rate": 7.239590017751423e-06, + "loss": 0.0226, + "step": 19480 + }, + { + "epoch": 1.1678351009647072, + "grad_norm": 0.6025039553642273, + "learning_rate": 7.230734283812979e-06, + "loss": 0.0246, + "step": 19490 + }, + { + "epoch": 1.1684342980406255, + "grad_norm": 0.4983830749988556, + "learning_rate": 7.221882973317795e-06, + "loss": 0.0199, + "step": 19500 + }, + { + "epoch": 1.1690334951165438, + "grad_norm": 0.3684524595737457, + "learning_rate": 7.213036096653383e-06, + "loss": 0.0252, + "step": 19510 + }, + { + "epoch": 1.169632692192462, + "grad_norm": 0.36924007534980774, + "learning_rate": 7.204193664202046e-06, + "loss": 0.0277, + "step": 19520 + }, + { + "epoch": 1.1702318892683803, + "grad_norm": 0.3531496822834015, + "learning_rate": 7.1953556863408725e-06, + "loss": 0.0228, + "step": 19530 + }, + { + "epoch": 1.1708310863442986, + "grad_norm": 0.3995579779148102, + "learning_rate": 7.186522173441719e-06, + "loss": 0.0193, + "step": 19540 + }, + { + "epoch": 1.1714302834202168, + "grad_norm": 0.4124946892261505, + "learning_rate": 7.177693135871202e-06, + "loss": 0.0221, + "step": 19550 + }, + { + "epoch": 1.172029480496135, + "grad_norm": 0.3897329866886139, + "learning_rate": 7.168868583990693e-06, + "loss": 0.0221, + "step": 19560 + }, + { + "epoch": 1.1726286775720534, + "grad_norm": 0.45230787992477417, + "learning_rate": 7.160048528156294e-06, + "loss": 0.0238, + "step": 19570 + }, + { + "epoch": 1.1732278746479716, + "grad_norm": 0.45878538489341736, + "learning_rate": 7.151232978718834e-06, + "loss": 0.0244, + "step": 19580 + }, + { + "epoch": 1.17382707172389, + "grad_norm": 0.4302407503128052, + "learning_rate": 7.1424219460238476e-06, + "loss": 0.0237, + "step": 19590 + }, + { + "epoch": 1.1744262687998082, + "grad_norm": 0.30422642827033997, + "learning_rate": 7.133615440411572e-06, + "loss": 0.0173, + "step": 19600 + }, + { + "epoch": 1.1750254658757266, + "grad_norm": 0.49566513299942017, + "learning_rate": 7.124813472216936e-06, + "loss": 0.0201, + "step": 19610 + }, + { + "epoch": 1.1756246629516447, + "grad_norm": 0.43262094259262085, + "learning_rate": 7.116016051769541e-06, + "loss": 0.0227, + "step": 19620 + }, + { + "epoch": 1.1762238600275632, + "grad_norm": 0.8250450491905212, + "learning_rate": 7.107223189393646e-06, + "loss": 0.0259, + "step": 19630 + }, + { + "epoch": 1.1768230571034812, + "grad_norm": 0.3265332281589508, + "learning_rate": 7.098434895408162e-06, + "loss": 0.0205, + "step": 19640 + }, + { + "epoch": 1.1774222541793997, + "grad_norm": 0.2871774435043335, + "learning_rate": 7.0896511801266446e-06, + "loss": 0.0201, + "step": 19650 + }, + { + "epoch": 1.1780214512553178, + "grad_norm": 0.4341558814048767, + "learning_rate": 7.080872053857273e-06, + "loss": 0.0199, + "step": 19660 + }, + { + "epoch": 1.1786206483312363, + "grad_norm": 0.43365293741226196, + "learning_rate": 7.072097526902846e-06, + "loss": 0.0201, + "step": 19670 + }, + { + "epoch": 1.1792198454071543, + "grad_norm": 0.5876246690750122, + "learning_rate": 7.0633276095607505e-06, + "loss": 0.0205, + "step": 19680 + }, + { + "epoch": 1.1798190424830728, + "grad_norm": 0.2719171643257141, + "learning_rate": 7.054562312122979e-06, + "loss": 0.0211, + "step": 19690 + }, + { + "epoch": 1.1804182395589908, + "grad_norm": 0.38791123032569885, + "learning_rate": 7.045801644876091e-06, + "loss": 0.0244, + "step": 19700 + }, + { + "epoch": 1.1810174366349093, + "grad_norm": 0.4082484543323517, + "learning_rate": 7.037045618101226e-06, + "loss": 0.0206, + "step": 19710 + }, + { + "epoch": 1.1816166337108274, + "grad_norm": 0.5010205507278442, + "learning_rate": 7.028294242074066e-06, + "loss": 0.0245, + "step": 19720 + }, + { + "epoch": 1.1822158307867459, + "grad_norm": 0.4404369294643402, + "learning_rate": 7.0195475270648315e-06, + "loss": 0.0268, + "step": 19730 + }, + { + "epoch": 1.1828150278626641, + "grad_norm": 0.5171347856521606, + "learning_rate": 7.010805483338283e-06, + "loss": 0.024, + "step": 19740 + }, + { + "epoch": 1.1834142249385824, + "grad_norm": 0.5137951970100403, + "learning_rate": 7.0020681211537e-06, + "loss": 0.0241, + "step": 19750 + }, + { + "epoch": 1.1840134220145007, + "grad_norm": 0.563709557056427, + "learning_rate": 6.993335450764864e-06, + "loss": 0.0193, + "step": 19760 + }, + { + "epoch": 1.184612619090419, + "grad_norm": 0.44687238335609436, + "learning_rate": 6.9846074824200435e-06, + "loss": 0.0207, + "step": 19770 + }, + { + "epoch": 1.1852118161663372, + "grad_norm": 0.33815798163414, + "learning_rate": 6.975884226362e-06, + "loss": 0.0246, + "step": 19780 + }, + { + "epoch": 1.1858110132422555, + "grad_norm": 0.33789384365081787, + "learning_rate": 6.967165692827958e-06, + "loss": 0.0206, + "step": 19790 + }, + { + "epoch": 1.1864102103181737, + "grad_norm": 0.38053908944129944, + "learning_rate": 6.958451892049609e-06, + "loss": 0.0195, + "step": 19800 + }, + { + "epoch": 1.187009407394092, + "grad_norm": 0.5730066299438477, + "learning_rate": 6.949742834253074e-06, + "loss": 0.0199, + "step": 19810 + }, + { + "epoch": 1.1876086044700103, + "grad_norm": 0.42453598976135254, + "learning_rate": 6.941038529658924e-06, + "loss": 0.0218, + "step": 19820 + }, + { + "epoch": 1.1882078015459285, + "grad_norm": 0.48010921478271484, + "learning_rate": 6.932338988482141e-06, + "loss": 0.0328, + "step": 19830 + }, + { + "epoch": 1.1888069986218468, + "grad_norm": 0.5227254629135132, + "learning_rate": 6.923644220932124e-06, + "loss": 0.019, + "step": 19840 + }, + { + "epoch": 1.189406195697765, + "grad_norm": 0.4078599810600281, + "learning_rate": 6.914954237212668e-06, + "loss": 0.0212, + "step": 19850 + }, + { + "epoch": 1.1900053927736833, + "grad_norm": 0.4473094046115875, + "learning_rate": 6.906269047521947e-06, + "loss": 0.0281, + "step": 19860 + }, + { + "epoch": 1.1906045898496016, + "grad_norm": 0.3459968864917755, + "learning_rate": 6.8975886620525215e-06, + "loss": 0.0231, + "step": 19870 + }, + { + "epoch": 1.1912037869255199, + "grad_norm": 0.4205886721611023, + "learning_rate": 6.888913090991304e-06, + "loss": 0.0256, + "step": 19880 + }, + { + "epoch": 1.1918029840014381, + "grad_norm": 0.5397320985794067, + "learning_rate": 6.880242344519564e-06, + "loss": 0.0214, + "step": 19890 + }, + { + "epoch": 1.1924021810773564, + "grad_norm": 0.6208626627922058, + "learning_rate": 6.871576432812899e-06, + "loss": 0.0224, + "step": 19900 + }, + { + "epoch": 1.1930013781532747, + "grad_norm": 0.34377506375312805, + "learning_rate": 6.862915366041247e-06, + "loss": 0.0197, + "step": 19910 + }, + { + "epoch": 1.193600575229193, + "grad_norm": 0.4086950123310089, + "learning_rate": 6.854259154368844e-06, + "loss": 0.0202, + "step": 19920 + }, + { + "epoch": 1.1941997723051112, + "grad_norm": 0.5211176872253418, + "learning_rate": 6.845607807954242e-06, + "loss": 0.0201, + "step": 19930 + }, + { + "epoch": 1.1947989693810295, + "grad_norm": 0.3705415725708008, + "learning_rate": 6.836961336950279e-06, + "loss": 0.0219, + "step": 19940 + }, + { + "epoch": 1.1953981664569477, + "grad_norm": 0.32692769169807434, + "learning_rate": 6.828319751504063e-06, + "loss": 0.0204, + "step": 19950 + }, + { + "epoch": 1.195997363532866, + "grad_norm": 0.42599135637283325, + "learning_rate": 6.819683061756983e-06, + "loss": 0.0213, + "step": 19960 + }, + { + "epoch": 1.1965965606087843, + "grad_norm": 0.565449595451355, + "learning_rate": 6.811051277844672e-06, + "loss": 0.0223, + "step": 19970 + }, + { + "epoch": 1.1971957576847025, + "grad_norm": 0.4027825593948364, + "learning_rate": 6.802424409897012e-06, + "loss": 0.0233, + "step": 19980 + }, + { + "epoch": 1.1977949547606208, + "grad_norm": 0.4833034574985504, + "learning_rate": 6.793802468038111e-06, + "loss": 0.0309, + "step": 19990 + }, + { + "epoch": 1.198394151836539, + "grad_norm": 0.5570312738418579, + "learning_rate": 6.785185462386297e-06, + "loss": 0.0213, + "step": 20000 + }, + { + "epoch": 1.1989933489124573, + "grad_norm": 0.30241742730140686, + "learning_rate": 6.776573403054111e-06, + "loss": 0.0197, + "step": 20010 + }, + { + "epoch": 1.1995925459883756, + "grad_norm": 0.37468239665031433, + "learning_rate": 6.767966300148277e-06, + "loss": 0.0214, + "step": 20020 + }, + { + "epoch": 1.2001917430642939, + "grad_norm": 0.5555301904678345, + "learning_rate": 6.759364163769717e-06, + "loss": 0.0223, + "step": 20030 + }, + { + "epoch": 1.2007909401402121, + "grad_norm": 0.6084730625152588, + "learning_rate": 6.750767004013511e-06, + "loss": 0.0261, + "step": 20040 + }, + { + "epoch": 1.2013901372161304, + "grad_norm": 0.5931955575942993, + "learning_rate": 6.742174830968907e-06, + "loss": 0.0237, + "step": 20050 + }, + { + "epoch": 1.2019893342920487, + "grad_norm": 0.30350545048713684, + "learning_rate": 6.733587654719298e-06, + "loss": 0.02, + "step": 20060 + }, + { + "epoch": 1.202588531367967, + "grad_norm": 0.6784055233001709, + "learning_rate": 6.725005485342219e-06, + "loss": 0.0281, + "step": 20070 + }, + { + "epoch": 1.2031877284438852, + "grad_norm": 0.5559973120689392, + "learning_rate": 6.716428332909318e-06, + "loss": 0.0204, + "step": 20080 + }, + { + "epoch": 1.2037869255198035, + "grad_norm": 0.7529487013816833, + "learning_rate": 6.707856207486361e-06, + "loss": 0.0235, + "step": 20090 + }, + { + "epoch": 1.2043861225957218, + "grad_norm": 0.7032052874565125, + "learning_rate": 6.69928911913322e-06, + "loss": 0.0176, + "step": 20100 + }, + { + "epoch": 1.20498531967164, + "grad_norm": 0.5018401741981506, + "learning_rate": 6.690727077903843e-06, + "loss": 0.0197, + "step": 20110 + }, + { + "epoch": 1.2055845167475583, + "grad_norm": 0.5020368695259094, + "learning_rate": 6.68217009384627e-06, + "loss": 0.0231, + "step": 20120 + }, + { + "epoch": 1.2061837138234766, + "grad_norm": 0.3605690598487854, + "learning_rate": 6.6736181770025895e-06, + "loss": 0.0254, + "step": 20130 + }, + { + "epoch": 1.2067829108993948, + "grad_norm": 0.3482762575149536, + "learning_rate": 6.665071337408959e-06, + "loss": 0.0223, + "step": 20140 + }, + { + "epoch": 1.207382107975313, + "grad_norm": 0.4260469675064087, + "learning_rate": 6.656529585095568e-06, + "loss": 0.0199, + "step": 20150 + }, + { + "epoch": 1.2079813050512314, + "grad_norm": 0.23622000217437744, + "learning_rate": 6.647992930086644e-06, + "loss": 0.0239, + "step": 20160 + }, + { + "epoch": 1.2085805021271496, + "grad_norm": 0.3683573007583618, + "learning_rate": 6.639461382400419e-06, + "loss": 0.0223, + "step": 20170 + }, + { + "epoch": 1.209179699203068, + "grad_norm": 0.32972025871276855, + "learning_rate": 6.630934952049143e-06, + "loss": 0.0228, + "step": 20180 + }, + { + "epoch": 1.2097788962789862, + "grad_norm": 0.4159783124923706, + "learning_rate": 6.62241364903906e-06, + "loss": 0.0221, + "step": 20190 + }, + { + "epoch": 1.2103780933549044, + "grad_norm": 0.24288412928581238, + "learning_rate": 6.613897483370389e-06, + "loss": 0.0188, + "step": 20200 + }, + { + "epoch": 1.2109772904308227, + "grad_norm": 0.42375463247299194, + "learning_rate": 6.6053864650373286e-06, + "loss": 0.0183, + "step": 20210 + }, + { + "epoch": 1.211576487506741, + "grad_norm": 0.26672226190567017, + "learning_rate": 6.596880604028027e-06, + "loss": 0.02, + "step": 20220 + }, + { + "epoch": 1.2121756845826592, + "grad_norm": 0.30816635489463806, + "learning_rate": 6.588379910324592e-06, + "loss": 0.0219, + "step": 20230 + }, + { + "epoch": 1.2127748816585775, + "grad_norm": 0.315452516078949, + "learning_rate": 6.579884393903056e-06, + "loss": 0.0218, + "step": 20240 + }, + { + "epoch": 1.2133740787344958, + "grad_norm": 0.5412175059318542, + "learning_rate": 6.571394064733388e-06, + "loss": 0.0233, + "step": 20250 + }, + { + "epoch": 1.213973275810414, + "grad_norm": 0.4290241003036499, + "learning_rate": 6.562908932779455e-06, + "loss": 0.0233, + "step": 20260 + }, + { + "epoch": 1.2145724728863323, + "grad_norm": 0.3977762460708618, + "learning_rate": 6.554429007999035e-06, + "loss": 0.0239, + "step": 20270 + }, + { + "epoch": 1.2151716699622506, + "grad_norm": 0.4023628532886505, + "learning_rate": 6.545954300343791e-06, + "loss": 0.0197, + "step": 20280 + }, + { + "epoch": 1.2157708670381688, + "grad_norm": 0.8707197308540344, + "learning_rate": 6.53748481975927e-06, + "loss": 0.029, + "step": 20290 + }, + { + "epoch": 1.216370064114087, + "grad_norm": 0.37878328561782837, + "learning_rate": 6.529020576184872e-06, + "loss": 0.0218, + "step": 20300 + }, + { + "epoch": 1.2169692611900054, + "grad_norm": 0.685556173324585, + "learning_rate": 6.520561579553859e-06, + "loss": 0.0248, + "step": 20310 + }, + { + "epoch": 1.2175684582659236, + "grad_norm": 0.5783588886260986, + "learning_rate": 6.512107839793337e-06, + "loss": 0.02, + "step": 20320 + }, + { + "epoch": 1.218167655341842, + "grad_norm": 0.5456825494766235, + "learning_rate": 6.503659366824239e-06, + "loss": 0.0279, + "step": 20330 + }, + { + "epoch": 1.2187668524177602, + "grad_norm": 0.6162738800048828, + "learning_rate": 6.495216170561325e-06, + "loss": 0.0259, + "step": 20340 + }, + { + "epoch": 1.2193660494936784, + "grad_norm": 0.38887348771095276, + "learning_rate": 6.4867782609131445e-06, + "loss": 0.0198, + "step": 20350 + }, + { + "epoch": 1.2199652465695967, + "grad_norm": 0.5207514762878418, + "learning_rate": 6.4783456477820625e-06, + "loss": 0.0201, + "step": 20360 + }, + { + "epoch": 1.220564443645515, + "grad_norm": 0.671120822429657, + "learning_rate": 6.469918341064219e-06, + "loss": 0.0259, + "step": 20370 + }, + { + "epoch": 1.2211636407214332, + "grad_norm": 0.28870952129364014, + "learning_rate": 6.461496350649529e-06, + "loss": 0.0175, + "step": 20380 + }, + { + "epoch": 1.2217628377973515, + "grad_norm": 0.3909374177455902, + "learning_rate": 6.453079686421665e-06, + "loss": 0.0214, + "step": 20390 + }, + { + "epoch": 1.2223620348732698, + "grad_norm": 0.3419650197029114, + "learning_rate": 6.4446683582580495e-06, + "loss": 0.0217, + "step": 20400 + }, + { + "epoch": 1.222961231949188, + "grad_norm": 0.563515305519104, + "learning_rate": 6.436262376029847e-06, + "loss": 0.0185, + "step": 20410 + }, + { + "epoch": 1.2235604290251063, + "grad_norm": 0.6295453310012817, + "learning_rate": 6.427861749601945e-06, + "loss": 0.023, + "step": 20420 + }, + { + "epoch": 1.2241596261010246, + "grad_norm": 0.4404713213443756, + "learning_rate": 6.4194664888329515e-06, + "loss": 0.0188, + "step": 20430 + }, + { + "epoch": 1.2247588231769428, + "grad_norm": 0.698448121547699, + "learning_rate": 6.411076603575166e-06, + "loss": 0.0225, + "step": 20440 + }, + { + "epoch": 1.2253580202528611, + "grad_norm": 0.5679222941398621, + "learning_rate": 6.402692103674587e-06, + "loss": 0.0213, + "step": 20450 + }, + { + "epoch": 1.2259572173287794, + "grad_norm": 0.5237470269203186, + "learning_rate": 6.394312998970895e-06, + "loss": 0.0261, + "step": 20460 + }, + { + "epoch": 1.2265564144046976, + "grad_norm": 0.4205586016178131, + "learning_rate": 6.385939299297437e-06, + "loss": 0.0232, + "step": 20470 + }, + { + "epoch": 1.227155611480616, + "grad_norm": 0.36608314514160156, + "learning_rate": 6.3775710144812145e-06, + "loss": 0.02, + "step": 20480 + }, + { + "epoch": 1.2277548085565342, + "grad_norm": 0.49511757493019104, + "learning_rate": 6.369208154342872e-06, + "loss": 0.0247, + "step": 20490 + }, + { + "epoch": 1.2283540056324525, + "grad_norm": 0.3475521206855774, + "learning_rate": 6.360850728696695e-06, + "loss": 0.0202, + "step": 20500 + }, + { + "epoch": 1.2289532027083707, + "grad_norm": 0.36345914006233215, + "learning_rate": 6.3524987473505865e-06, + "loss": 0.0197, + "step": 20510 + }, + { + "epoch": 1.229552399784289, + "grad_norm": 0.34304162859916687, + "learning_rate": 6.344152220106068e-06, + "loss": 0.0183, + "step": 20520 + }, + { + "epoch": 1.2301515968602073, + "grad_norm": 0.41459065675735474, + "learning_rate": 6.335811156758245e-06, + "loss": 0.02, + "step": 20530 + }, + { + "epoch": 1.2307507939361255, + "grad_norm": 0.34139952063560486, + "learning_rate": 6.327475567095824e-06, + "loss": 0.0211, + "step": 20540 + }, + { + "epoch": 1.2313499910120438, + "grad_norm": 0.29463231563568115, + "learning_rate": 6.319145460901086e-06, + "loss": 0.0225, + "step": 20550 + }, + { + "epoch": 1.231949188087962, + "grad_norm": 0.37984198331832886, + "learning_rate": 6.310820847949874e-06, + "loss": 0.0201, + "step": 20560 + }, + { + "epoch": 1.2325483851638803, + "grad_norm": 0.21912901103496552, + "learning_rate": 6.3025017380115836e-06, + "loss": 0.0226, + "step": 20570 + }, + { + "epoch": 1.2331475822397986, + "grad_norm": 0.34660178422927856, + "learning_rate": 6.294188140849153e-06, + "loss": 0.0179, + "step": 20580 + }, + { + "epoch": 1.2337467793157169, + "grad_norm": 0.6080809235572815, + "learning_rate": 6.285880066219049e-06, + "loss": 0.0187, + "step": 20590 + }, + { + "epoch": 1.2343459763916351, + "grad_norm": 0.43388310074806213, + "learning_rate": 6.277577523871268e-06, + "loss": 0.0226, + "step": 20600 + }, + { + "epoch": 1.2349451734675534, + "grad_norm": 0.53389972448349, + "learning_rate": 6.269280523549298e-06, + "loss": 0.0237, + "step": 20610 + }, + { + "epoch": 1.2355443705434717, + "grad_norm": 0.39731428027153015, + "learning_rate": 6.260989074990134e-06, + "loss": 0.0176, + "step": 20620 + }, + { + "epoch": 1.23614356761939, + "grad_norm": 0.32715681195259094, + "learning_rate": 6.252703187924252e-06, + "loss": 0.0211, + "step": 20630 + }, + { + "epoch": 1.2367427646953082, + "grad_norm": 0.36709150671958923, + "learning_rate": 6.244422872075602e-06, + "loss": 0.0194, + "step": 20640 + }, + { + "epoch": 1.2373419617712265, + "grad_norm": 0.5554866790771484, + "learning_rate": 6.236148137161602e-06, + "loss": 0.0202, + "step": 20650 + }, + { + "epoch": 1.2379411588471447, + "grad_norm": 0.26253199577331543, + "learning_rate": 6.227878992893104e-06, + "loss": 0.02, + "step": 20660 + }, + { + "epoch": 1.238540355923063, + "grad_norm": 0.3686104714870453, + "learning_rate": 6.219615448974419e-06, + "loss": 0.0191, + "step": 20670 + }, + { + "epoch": 1.2391395529989815, + "grad_norm": 0.36151114106178284, + "learning_rate": 6.211357515103266e-06, + "loss": 0.0213, + "step": 20680 + }, + { + "epoch": 1.2397387500748995, + "grad_norm": 0.5019435882568359, + "learning_rate": 6.203105200970801e-06, + "loss": 0.0203, + "step": 20690 + }, + { + "epoch": 1.240337947150818, + "grad_norm": 1.1914043426513672, + "learning_rate": 6.194858516261565e-06, + "loss": 0.0249, + "step": 20700 + }, + { + "epoch": 1.240937144226736, + "grad_norm": 0.45042529702186584, + "learning_rate": 6.1866174706535065e-06, + "loss": 0.0244, + "step": 20710 + }, + { + "epoch": 1.2415363413026546, + "grad_norm": 0.3239169120788574, + "learning_rate": 6.17838207381795e-06, + "loss": 0.0219, + "step": 20720 + }, + { + "epoch": 1.2421355383785726, + "grad_norm": 0.3253174424171448, + "learning_rate": 6.170152335419598e-06, + "loss": 0.0226, + "step": 20730 + }, + { + "epoch": 1.242734735454491, + "grad_norm": 0.6497724652290344, + "learning_rate": 6.161928265116497e-06, + "loss": 0.0238, + "step": 20740 + }, + { + "epoch": 1.2433339325304091, + "grad_norm": 0.5800855159759521, + "learning_rate": 6.1537098725600585e-06, + "loss": 0.0211, + "step": 20750 + }, + { + "epoch": 1.2439331296063276, + "grad_norm": 0.29717954993247986, + "learning_rate": 6.145497167395025e-06, + "loss": 0.0198, + "step": 20760 + }, + { + "epoch": 1.2445323266822457, + "grad_norm": 0.35056066513061523, + "learning_rate": 6.137290159259457e-06, + "loss": 0.0219, + "step": 20770 + }, + { + "epoch": 1.2451315237581642, + "grad_norm": 0.28448906540870667, + "learning_rate": 6.129088857784744e-06, + "loss": 0.0227, + "step": 20780 + }, + { + "epoch": 1.2457307208340822, + "grad_norm": 0.33300310373306274, + "learning_rate": 6.12089327259556e-06, + "loss": 0.0165, + "step": 20790 + }, + { + "epoch": 1.2463299179100007, + "grad_norm": 0.5134487748146057, + "learning_rate": 6.112703413309888e-06, + "loss": 0.0219, + "step": 20800 + }, + { + "epoch": 1.246929114985919, + "grad_norm": 0.45153549313545227, + "learning_rate": 6.104519289538983e-06, + "loss": 0.0191, + "step": 20810 + }, + { + "epoch": 1.2475283120618372, + "grad_norm": 0.6483689546585083, + "learning_rate": 6.09634091088737e-06, + "loss": 0.0211, + "step": 20820 + }, + { + "epoch": 1.2481275091377555, + "grad_norm": 0.5660327076911926, + "learning_rate": 6.08816828695283e-06, + "loss": 0.0207, + "step": 20830 + }, + { + "epoch": 1.2487267062136738, + "grad_norm": 0.6027820706367493, + "learning_rate": 6.080001427326393e-06, + "loss": 0.0201, + "step": 20840 + }, + { + "epoch": 1.249325903289592, + "grad_norm": 0.6102983951568604, + "learning_rate": 6.071840341592327e-06, + "loss": 0.0207, + "step": 20850 + }, + { + "epoch": 1.2499251003655103, + "grad_norm": 0.4383072257041931, + "learning_rate": 6.063685039328116e-06, + "loss": 0.0275, + "step": 20860 + }, + { + "epoch": 1.2505242974414286, + "grad_norm": 0.42298370599746704, + "learning_rate": 6.055535530104466e-06, + "loss": 0.0204, + "step": 20870 + }, + { + "epoch": 1.2511234945173468, + "grad_norm": 0.30508092045783997, + "learning_rate": 6.047391823485273e-06, + "loss": 0.0195, + "step": 20880 + }, + { + "epoch": 1.251722691593265, + "grad_norm": 0.6242369413375854, + "learning_rate": 6.039253929027638e-06, + "loss": 0.0215, + "step": 20890 + }, + { + "epoch": 1.2523218886691834, + "grad_norm": 0.38399502635002136, + "learning_rate": 6.031121856281828e-06, + "loss": 0.0201, + "step": 20900 + }, + { + "epoch": 1.2529210857451016, + "grad_norm": 0.4721924066543579, + "learning_rate": 6.022995614791288e-06, + "loss": 0.0243, + "step": 20910 + }, + { + "epoch": 1.25352028282102, + "grad_norm": 0.6958035230636597, + "learning_rate": 6.01487521409261e-06, + "loss": 0.0201, + "step": 20920 + }, + { + "epoch": 1.2541194798969382, + "grad_norm": 0.3826717436313629, + "learning_rate": 6.0067606637155395e-06, + "loss": 0.0236, + "step": 20930 + }, + { + "epoch": 1.2547186769728564, + "grad_norm": 0.3098534941673279, + "learning_rate": 5.998651973182953e-06, + "loss": 0.0216, + "step": 20940 + }, + { + "epoch": 1.2553178740487747, + "grad_norm": 0.43973061442375183, + "learning_rate": 5.990549152010853e-06, + "loss": 0.0234, + "step": 20950 + }, + { + "epoch": 1.255917071124693, + "grad_norm": 0.46570682525634766, + "learning_rate": 5.98245220970835e-06, + "loss": 0.0226, + "step": 20960 + }, + { + "epoch": 1.2565162682006112, + "grad_norm": 0.46847036480903625, + "learning_rate": 5.9743611557776505e-06, + "loss": 0.0188, + "step": 20970 + }, + { + "epoch": 1.2571154652765295, + "grad_norm": 0.5139725804328918, + "learning_rate": 5.966275999714063e-06, + "loss": 0.0195, + "step": 20980 + }, + { + "epoch": 1.2577146623524478, + "grad_norm": 0.48436662554740906, + "learning_rate": 5.958196751005967e-06, + "loss": 0.0206, + "step": 20990 + }, + { + "epoch": 1.258313859428366, + "grad_norm": 0.3445553481578827, + "learning_rate": 5.950123419134817e-06, + "loss": 0.0241, + "step": 21000 + }, + { + "epoch": 1.2589130565042843, + "grad_norm": 0.8473356366157532, + "learning_rate": 5.942056013575106e-06, + "loss": 0.0248, + "step": 21010 + }, + { + "epoch": 1.2595122535802026, + "grad_norm": 0.6241415143013, + "learning_rate": 5.933994543794391e-06, + "loss": 0.0242, + "step": 21020 + }, + { + "epoch": 1.2601114506561208, + "grad_norm": 0.7302873730659485, + "learning_rate": 5.925939019253255e-06, + "loss": 0.0224, + "step": 21030 + }, + { + "epoch": 1.2607106477320391, + "grad_norm": 0.29269692301750183, + "learning_rate": 5.9178894494053085e-06, + "loss": 0.0181, + "step": 21040 + }, + { + "epoch": 1.2613098448079574, + "grad_norm": 0.4065910577774048, + "learning_rate": 5.909845843697164e-06, + "loss": 0.0253, + "step": 21050 + }, + { + "epoch": 1.2619090418838756, + "grad_norm": 0.36930134892463684, + "learning_rate": 5.901808211568441e-06, + "loss": 0.0203, + "step": 21060 + }, + { + "epoch": 1.262508238959794, + "grad_norm": 0.5521696209907532, + "learning_rate": 5.8937765624517495e-06, + "loss": 0.0208, + "step": 21070 + }, + { + "epoch": 1.2631074360357122, + "grad_norm": 0.3761119544506073, + "learning_rate": 5.885750905772678e-06, + "loss": 0.0209, + "step": 21080 + }, + { + "epoch": 1.2637066331116305, + "grad_norm": 0.3330603241920471, + "learning_rate": 5.877731250949785e-06, + "loss": 0.0233, + "step": 21090 + }, + { + "epoch": 1.2643058301875487, + "grad_norm": 0.27771884202957153, + "learning_rate": 5.869717607394576e-06, + "loss": 0.0162, + "step": 21100 + }, + { + "epoch": 1.264905027263467, + "grad_norm": 0.4225069284439087, + "learning_rate": 5.86170998451151e-06, + "loss": 0.0177, + "step": 21110 + }, + { + "epoch": 1.2655042243393853, + "grad_norm": 0.33680275082588196, + "learning_rate": 5.8537083916979806e-06, + "loss": 0.0199, + "step": 21120 + }, + { + "epoch": 1.2661034214153035, + "grad_norm": 0.4399181604385376, + "learning_rate": 5.845712838344304e-06, + "loss": 0.0236, + "step": 21130 + }, + { + "epoch": 1.2667026184912218, + "grad_norm": 0.49677175283432007, + "learning_rate": 5.837723333833704e-06, + "loss": 0.0265, + "step": 21140 + }, + { + "epoch": 1.26730181556714, + "grad_norm": 0.39700835943222046, + "learning_rate": 5.829739887542306e-06, + "loss": 0.0193, + "step": 21150 + }, + { + "epoch": 1.2679010126430583, + "grad_norm": 0.4604041278362274, + "learning_rate": 5.821762508839135e-06, + "loss": 0.0208, + "step": 21160 + }, + { + "epoch": 1.2685002097189766, + "grad_norm": 0.26002946496009827, + "learning_rate": 5.813791207086085e-06, + "loss": 0.0197, + "step": 21170 + }, + { + "epoch": 1.2690994067948949, + "grad_norm": 0.3256632685661316, + "learning_rate": 5.805825991637928e-06, + "loss": 0.0192, + "step": 21180 + }, + { + "epoch": 1.2696986038708131, + "grad_norm": 0.3573099672794342, + "learning_rate": 5.7978668718422826e-06, + "loss": 0.0184, + "step": 21190 + }, + { + "epoch": 1.2702978009467314, + "grad_norm": 0.3116256892681122, + "learning_rate": 5.78991385703962e-06, + "loss": 0.0197, + "step": 21200 + }, + { + "epoch": 1.2708969980226497, + "grad_norm": 0.39247608184814453, + "learning_rate": 5.781966956563247e-06, + "loss": 0.0219, + "step": 21210 + }, + { + "epoch": 1.271496195098568, + "grad_norm": 0.31291085481643677, + "learning_rate": 5.774026179739299e-06, + "loss": 0.0194, + "step": 21220 + }, + { + "epoch": 1.2720953921744862, + "grad_norm": 0.5996116399765015, + "learning_rate": 5.766091535886716e-06, + "loss": 0.0264, + "step": 21230 + }, + { + "epoch": 1.2726945892504045, + "grad_norm": 0.24854864180088043, + "learning_rate": 5.7581630343172364e-06, + "loss": 0.0207, + "step": 21240 + }, + { + "epoch": 1.2732937863263227, + "grad_norm": 0.5746667385101318, + "learning_rate": 5.750240684335408e-06, + "loss": 0.0195, + "step": 21250 + }, + { + "epoch": 1.273892983402241, + "grad_norm": 0.5744135975837708, + "learning_rate": 5.742324495238548e-06, + "loss": 0.0182, + "step": 21260 + }, + { + "epoch": 1.2744921804781593, + "grad_norm": 0.5161272883415222, + "learning_rate": 5.734414476316747e-06, + "loss": 0.0212, + "step": 21270 + }, + { + "epoch": 1.2750913775540775, + "grad_norm": 0.5889247059822083, + "learning_rate": 5.726510636852848e-06, + "loss": 0.0172, + "step": 21280 + }, + { + "epoch": 1.2756905746299958, + "grad_norm": 0.53412926197052, + "learning_rate": 5.71861298612245e-06, + "loss": 0.0209, + "step": 21290 + }, + { + "epoch": 1.276289771705914, + "grad_norm": 0.3421672582626343, + "learning_rate": 5.7107215333938825e-06, + "loss": 0.0193, + "step": 21300 + }, + { + "epoch": 1.2768889687818323, + "grad_norm": 0.409906268119812, + "learning_rate": 5.7028362879282125e-06, + "loss": 0.0173, + "step": 21310 + }, + { + "epoch": 1.2774881658577506, + "grad_norm": 0.5139239430427551, + "learning_rate": 5.694957258979211e-06, + "loss": 0.0198, + "step": 21320 + }, + { + "epoch": 1.2780873629336689, + "grad_norm": 0.5014253258705139, + "learning_rate": 5.6870844557933524e-06, + "loss": 0.0177, + "step": 21330 + }, + { + "epoch": 1.2786865600095871, + "grad_norm": 0.5942979454994202, + "learning_rate": 5.679217887609813e-06, + "loss": 0.0206, + "step": 21340 + }, + { + "epoch": 1.2792857570855054, + "grad_norm": 0.218281552195549, + "learning_rate": 5.671357563660449e-06, + "loss": 0.0204, + "step": 21350 + }, + { + "epoch": 1.2798849541614237, + "grad_norm": 0.43725427985191345, + "learning_rate": 5.663503493169793e-06, + "loss": 0.0215, + "step": 21360 + }, + { + "epoch": 1.280484151237342, + "grad_norm": 0.3467969000339508, + "learning_rate": 5.655655685355026e-06, + "loss": 0.0168, + "step": 21370 + }, + { + "epoch": 1.2810833483132602, + "grad_norm": 0.2697127163410187, + "learning_rate": 5.647814149425992e-06, + "loss": 0.0214, + "step": 21380 + }, + { + "epoch": 1.2816825453891785, + "grad_norm": 0.43687018752098083, + "learning_rate": 5.639978894585169e-06, + "loss": 0.0262, + "step": 21390 + }, + { + "epoch": 1.2822817424650967, + "grad_norm": 0.47759339213371277, + "learning_rate": 5.63214993002767e-06, + "loss": 0.0212, + "step": 21400 + }, + { + "epoch": 1.282880939541015, + "grad_norm": 0.33211249113082886, + "learning_rate": 5.6243272649412115e-06, + "loss": 0.0228, + "step": 21410 + }, + { + "epoch": 1.2834801366169333, + "grad_norm": 0.29453045129776, + "learning_rate": 5.6165109085061374e-06, + "loss": 0.0233, + "step": 21420 + }, + { + "epoch": 1.2840793336928515, + "grad_norm": 0.34539318084716797, + "learning_rate": 5.608700869895367e-06, + "loss": 0.021, + "step": 21430 + }, + { + "epoch": 1.2846785307687698, + "grad_norm": 0.6664339900016785, + "learning_rate": 5.600897158274421e-06, + "loss": 0.0203, + "step": 21440 + }, + { + "epoch": 1.285277727844688, + "grad_norm": 0.21404555439949036, + "learning_rate": 5.593099782801392e-06, + "loss": 0.0209, + "step": 21450 + }, + { + "epoch": 1.2858769249206063, + "grad_norm": 0.4320753812789917, + "learning_rate": 5.585308752626929e-06, + "loss": 0.0236, + "step": 21460 + }, + { + "epoch": 1.2864761219965246, + "grad_norm": 0.415399968624115, + "learning_rate": 5.5775240768942415e-06, + "loss": 0.0235, + "step": 21470 + }, + { + "epoch": 1.2870753190724429, + "grad_norm": 0.2643829584121704, + "learning_rate": 5.569745764739082e-06, + "loss": 0.0203, + "step": 21480 + }, + { + "epoch": 1.2876745161483611, + "grad_norm": 0.4354988932609558, + "learning_rate": 5.561973825289734e-06, + "loss": 0.0172, + "step": 21490 + }, + { + "epoch": 1.2882737132242794, + "grad_norm": 0.43992263078689575, + "learning_rate": 5.554208267666996e-06, + "loss": 0.018, + "step": 21500 + }, + { + "epoch": 1.2888729103001977, + "grad_norm": 0.32208460569381714, + "learning_rate": 5.54644910098419e-06, + "loss": 0.0183, + "step": 21510 + }, + { + "epoch": 1.289472107376116, + "grad_norm": 0.27261701226234436, + "learning_rate": 5.538696334347122e-06, + "loss": 0.0196, + "step": 21520 + }, + { + "epoch": 1.2900713044520342, + "grad_norm": 0.4348963499069214, + "learning_rate": 5.5309499768541005e-06, + "loss": 0.0173, + "step": 21530 + }, + { + "epoch": 1.2906705015279525, + "grad_norm": 0.40379852056503296, + "learning_rate": 5.5232100375959095e-06, + "loss": 0.0202, + "step": 21540 + }, + { + "epoch": 1.2912696986038708, + "grad_norm": 0.4592876136302948, + "learning_rate": 5.515476525655792e-06, + "loss": 0.0219, + "step": 21550 + }, + { + "epoch": 1.291868895679789, + "grad_norm": 0.4797484278678894, + "learning_rate": 5.50774945010946e-06, + "loss": 0.0182, + "step": 21560 + }, + { + "epoch": 1.2924680927557073, + "grad_norm": 0.47892817854881287, + "learning_rate": 5.500028820025065e-06, + "loss": 0.0185, + "step": 21570 + }, + { + "epoch": 1.2930672898316256, + "grad_norm": 0.46308979392051697, + "learning_rate": 5.492314644463202e-06, + "loss": 0.018, + "step": 21580 + }, + { + "epoch": 1.2936664869075438, + "grad_norm": 0.7745133638381958, + "learning_rate": 5.484606932476875e-06, + "loss": 0.0207, + "step": 21590 + }, + { + "epoch": 1.294265683983462, + "grad_norm": 0.6577957272529602, + "learning_rate": 5.476905693111521e-06, + "loss": 0.0166, + "step": 21600 + }, + { + "epoch": 1.2948648810593804, + "grad_norm": 0.43036580085754395, + "learning_rate": 5.4692109354049745e-06, + "loss": 0.0218, + "step": 21610 + }, + { + "epoch": 1.2954640781352986, + "grad_norm": 0.41811347007751465, + "learning_rate": 5.461522668387456e-06, + "loss": 0.0214, + "step": 21620 + }, + { + "epoch": 1.296063275211217, + "grad_norm": 0.31980884075164795, + "learning_rate": 5.453840901081584e-06, + "loss": 0.0198, + "step": 21630 + }, + { + "epoch": 1.2966624722871352, + "grad_norm": 0.3632652461528778, + "learning_rate": 5.4461656425023305e-06, + "loss": 0.0209, + "step": 21640 + }, + { + "epoch": 1.2972616693630534, + "grad_norm": 0.467146635055542, + "learning_rate": 5.438496901657042e-06, + "loss": 0.0173, + "step": 21650 + }, + { + "epoch": 1.2978608664389717, + "grad_norm": 0.5659807920455933, + "learning_rate": 5.430834687545416e-06, + "loss": 0.0199, + "step": 21660 + }, + { + "epoch": 1.2984600635148902, + "grad_norm": 0.24540813267230988, + "learning_rate": 5.423179009159489e-06, + "loss": 0.0178, + "step": 21670 + }, + { + "epoch": 1.2990592605908082, + "grad_norm": 0.3122001588344574, + "learning_rate": 5.4155298754836195e-06, + "loss": 0.0222, + "step": 21680 + }, + { + "epoch": 1.2996584576667267, + "grad_norm": 0.2879388928413391, + "learning_rate": 5.407887295494495e-06, + "loss": 0.0173, + "step": 21690 + }, + { + "epoch": 1.3002576547426448, + "grad_norm": 0.5185259580612183, + "learning_rate": 5.400251278161113e-06, + "loss": 0.0168, + "step": 21700 + }, + { + "epoch": 1.3008568518185633, + "grad_norm": 0.239187091588974, + "learning_rate": 5.392621832444758e-06, + "loss": 0.0198, + "step": 21710 + }, + { + "epoch": 1.3014560488944813, + "grad_norm": 0.3844532370567322, + "learning_rate": 5.384998967299016e-06, + "loss": 0.0179, + "step": 21720 + }, + { + "epoch": 1.3020552459703998, + "grad_norm": 0.3842040002346039, + "learning_rate": 5.377382691669737e-06, + "loss": 0.0204, + "step": 21730 + }, + { + "epoch": 1.3026544430463178, + "grad_norm": 0.26496851444244385, + "learning_rate": 5.369773014495048e-06, + "loss": 0.0172, + "step": 21740 + }, + { + "epoch": 1.3032536401222363, + "grad_norm": 0.40850451588630676, + "learning_rate": 5.36216994470533e-06, + "loss": 0.0189, + "step": 21750 + }, + { + "epoch": 1.3038528371981544, + "grad_norm": 0.21669425070285797, + "learning_rate": 5.354573491223212e-06, + "loss": 0.0192, + "step": 21760 + }, + { + "epoch": 1.3044520342740729, + "grad_norm": 0.43664559721946716, + "learning_rate": 5.3469836629635474e-06, + "loss": 0.021, + "step": 21770 + }, + { + "epoch": 1.305051231349991, + "grad_norm": 0.49064821004867554, + "learning_rate": 5.339400468833427e-06, + "loss": 0.02, + "step": 21780 + }, + { + "epoch": 1.3056504284259094, + "grad_norm": 0.9060949683189392, + "learning_rate": 5.3318239177321505e-06, + "loss": 0.0204, + "step": 21790 + }, + { + "epoch": 1.3062496255018274, + "grad_norm": 0.3413904309272766, + "learning_rate": 5.324254018551227e-06, + "loss": 0.0212, + "step": 21800 + }, + { + "epoch": 1.306848822577746, + "grad_norm": 0.2620849311351776, + "learning_rate": 5.316690780174352e-06, + "loss": 0.0201, + "step": 21810 + }, + { + "epoch": 1.307448019653664, + "grad_norm": 0.3972470760345459, + "learning_rate": 5.3091342114774016e-06, + "loss": 0.0216, + "step": 21820 + }, + { + "epoch": 1.3080472167295825, + "grad_norm": 0.4422028064727783, + "learning_rate": 5.301584321328435e-06, + "loss": 0.0177, + "step": 21830 + }, + { + "epoch": 1.3086464138055005, + "grad_norm": 0.2595955431461334, + "learning_rate": 5.294041118587667e-06, + "loss": 0.0214, + "step": 21840 + }, + { + "epoch": 1.309245610881419, + "grad_norm": 0.43522438406944275, + "learning_rate": 5.286504612107473e-06, + "loss": 0.0226, + "step": 21850 + }, + { + "epoch": 1.309844807957337, + "grad_norm": 0.33024686574935913, + "learning_rate": 5.278974810732353e-06, + "loss": 0.0199, + "step": 21860 + }, + { + "epoch": 1.3104440050332555, + "grad_norm": 0.3532852232456207, + "learning_rate": 5.271451723298952e-06, + "loss": 0.0194, + "step": 21870 + }, + { + "epoch": 1.3110432021091736, + "grad_norm": 0.3963644802570343, + "learning_rate": 5.263935358636034e-06, + "loss": 0.0171, + "step": 21880 + }, + { + "epoch": 1.311642399185092, + "grad_norm": 0.37003734707832336, + "learning_rate": 5.256425725564475e-06, + "loss": 0.0174, + "step": 21890 + }, + { + "epoch": 1.3122415962610101, + "grad_norm": 0.27832016348838806, + "learning_rate": 5.248922832897242e-06, + "loss": 0.0211, + "step": 21900 + }, + { + "epoch": 1.3128407933369286, + "grad_norm": 0.4203765392303467, + "learning_rate": 5.241426689439396e-06, + "loss": 0.0196, + "step": 21910 + }, + { + "epoch": 1.3134399904128466, + "grad_norm": 0.31796127557754517, + "learning_rate": 5.233937303988081e-06, + "loss": 0.019, + "step": 21920 + }, + { + "epoch": 1.3140391874887651, + "grad_norm": 0.4561198949813843, + "learning_rate": 5.22645468533251e-06, + "loss": 0.0198, + "step": 21930 + }, + { + "epoch": 1.3146383845646834, + "grad_norm": 0.4175209403038025, + "learning_rate": 5.2189788422539545e-06, + "loss": 0.0195, + "step": 21940 + }, + { + "epoch": 1.3152375816406017, + "grad_norm": 0.7017586827278137, + "learning_rate": 5.211509783525726e-06, + "loss": 0.0201, + "step": 21950 + }, + { + "epoch": 1.31583677871652, + "grad_norm": 0.4711352288722992, + "learning_rate": 5.2040475179131845e-06, + "loss": 0.02, + "step": 21960 + }, + { + "epoch": 1.3164359757924382, + "grad_norm": 0.2737489640712738, + "learning_rate": 5.196592054173714e-06, + "loss": 0.0198, + "step": 21970 + }, + { + "epoch": 1.3170351728683565, + "grad_norm": 0.44284430146217346, + "learning_rate": 5.189143401056722e-06, + "loss": 0.0206, + "step": 21980 + }, + { + "epoch": 1.3176343699442747, + "grad_norm": 0.4556163251399994, + "learning_rate": 5.181701567303612e-06, + "loss": 0.0208, + "step": 21990 + }, + { + "epoch": 1.318233567020193, + "grad_norm": 0.3158712685108185, + "learning_rate": 5.174266561647787e-06, + "loss": 0.0156, + "step": 22000 + }, + { + "epoch": 1.3188327640961113, + "grad_norm": 0.4620053172111511, + "learning_rate": 5.1668383928146455e-06, + "loss": 0.0187, + "step": 22010 + }, + { + "epoch": 1.3194319611720295, + "grad_norm": 0.7892107963562012, + "learning_rate": 5.159417069521556e-06, + "loss": 0.0195, + "step": 22020 + }, + { + "epoch": 1.3200311582479478, + "grad_norm": 0.37334534525871277, + "learning_rate": 5.152002600477859e-06, + "loss": 0.02, + "step": 22030 + }, + { + "epoch": 1.320630355323866, + "grad_norm": 0.4440039098262787, + "learning_rate": 5.144594994384839e-06, + "loss": 0.0244, + "step": 22040 + }, + { + "epoch": 1.3212295523997843, + "grad_norm": 0.2650533616542816, + "learning_rate": 5.137194259935739e-06, + "loss": 0.017, + "step": 22050 + }, + { + "epoch": 1.3218287494757026, + "grad_norm": 0.5425522327423096, + "learning_rate": 5.129800405815733e-06, + "loss": 0.019, + "step": 22060 + }, + { + "epoch": 1.3224279465516209, + "grad_norm": 0.5764152407646179, + "learning_rate": 5.122413440701921e-06, + "loss": 0.018, + "step": 22070 + }, + { + "epoch": 1.3230271436275391, + "grad_norm": 0.3985585868358612, + "learning_rate": 5.115033373263319e-06, + "loss": 0.0214, + "step": 22080 + }, + { + "epoch": 1.3236263407034574, + "grad_norm": 0.513511598110199, + "learning_rate": 5.107660212160841e-06, + "loss": 0.0189, + "step": 22090 + }, + { + "epoch": 1.3242255377793757, + "grad_norm": 0.3784070909023285, + "learning_rate": 5.100293966047308e-06, + "loss": 0.0164, + "step": 22100 + }, + { + "epoch": 1.324824734855294, + "grad_norm": 0.7029585242271423, + "learning_rate": 5.092934643567418e-06, + "loss": 0.0201, + "step": 22110 + }, + { + "epoch": 1.3254239319312122, + "grad_norm": 0.28351524472236633, + "learning_rate": 5.085582253357749e-06, + "loss": 0.0207, + "step": 22120 + }, + { + "epoch": 1.3260231290071305, + "grad_norm": 0.5500089526176453, + "learning_rate": 5.078236804046737e-06, + "loss": 0.0222, + "step": 22130 + }, + { + "epoch": 1.3266223260830488, + "grad_norm": 0.35926392674446106, + "learning_rate": 5.070898304254675e-06, + "loss": 0.0195, + "step": 22140 + }, + { + "epoch": 1.327221523158967, + "grad_norm": 0.24845866858959198, + "learning_rate": 5.063566762593704e-06, + "loss": 0.0198, + "step": 22150 + }, + { + "epoch": 1.3278207202348853, + "grad_norm": 0.3264683485031128, + "learning_rate": 5.056242187667797e-06, + "loss": 0.0178, + "step": 22160 + }, + { + "epoch": 1.3284199173108036, + "grad_norm": 0.47955816984176636, + "learning_rate": 5.04892458807275e-06, + "loss": 0.0206, + "step": 22170 + }, + { + "epoch": 1.3290191143867218, + "grad_norm": 0.31802570819854736, + "learning_rate": 5.04161397239617e-06, + "loss": 0.0168, + "step": 22180 + }, + { + "epoch": 1.32961831146264, + "grad_norm": 0.40685755014419556, + "learning_rate": 5.034310349217475e-06, + "loss": 0.0223, + "step": 22190 + }, + { + "epoch": 1.3302175085385584, + "grad_norm": 0.4924621284008026, + "learning_rate": 5.027013727107874e-06, + "loss": 0.0195, + "step": 22200 + }, + { + "epoch": 1.3308167056144766, + "grad_norm": 0.640724241733551, + "learning_rate": 5.01972411463036e-06, + "loss": 0.0183, + "step": 22210 + }, + { + "epoch": 1.331415902690395, + "grad_norm": 0.6712080836296082, + "learning_rate": 5.012441520339697e-06, + "loss": 0.0196, + "step": 22220 + }, + { + "epoch": 1.3320150997663132, + "grad_norm": 0.34785783290863037, + "learning_rate": 5.005165952782416e-06, + "loss": 0.0174, + "step": 22230 + }, + { + "epoch": 1.3326142968422314, + "grad_norm": 0.46851038932800293, + "learning_rate": 4.9978974204968e-06, + "loss": 0.0186, + "step": 22240 + }, + { + "epoch": 1.3332134939181497, + "grad_norm": 0.6138949394226074, + "learning_rate": 4.9906359320128804e-06, + "loss": 0.0197, + "step": 22250 + }, + { + "epoch": 1.333812690994068, + "grad_norm": 0.3083338439464569, + "learning_rate": 4.9833814958524115e-06, + "loss": 0.0179, + "step": 22260 + }, + { + "epoch": 1.3344118880699862, + "grad_norm": 0.3143295347690582, + "learning_rate": 4.976134120528886e-06, + "loss": 0.0217, + "step": 22270 + }, + { + "epoch": 1.3350110851459045, + "grad_norm": 0.3330692946910858, + "learning_rate": 4.9688938145474965e-06, + "loss": 0.0149, + "step": 22280 + }, + { + "epoch": 1.3356102822218228, + "grad_norm": 0.2732333242893219, + "learning_rate": 4.961660586405147e-06, + "loss": 0.017, + "step": 22290 + }, + { + "epoch": 1.336209479297741, + "grad_norm": 0.3350054621696472, + "learning_rate": 4.954434444590436e-06, + "loss": 0.022, + "step": 22300 + }, + { + "epoch": 1.3368086763736593, + "grad_norm": 0.2735322415828705, + "learning_rate": 4.947215397583639e-06, + "loss": 0.0181, + "step": 22310 + }, + { + "epoch": 1.3374078734495776, + "grad_norm": 0.5919206738471985, + "learning_rate": 4.9400034538567135e-06, + "loss": 0.0201, + "step": 22320 + }, + { + "epoch": 1.3380070705254958, + "grad_norm": 0.28201058506965637, + "learning_rate": 4.932798621873274e-06, + "loss": 0.0188, + "step": 22330 + }, + { + "epoch": 1.338606267601414, + "grad_norm": 0.505592942237854, + "learning_rate": 4.925600910088598e-06, + "loss": 0.0188, + "step": 22340 + }, + { + "epoch": 1.3392054646773324, + "grad_norm": 0.5231548547744751, + "learning_rate": 4.918410326949594e-06, + "loss": 0.0184, + "step": 22350 + }, + { + "epoch": 1.3398046617532506, + "grad_norm": 0.3743092715740204, + "learning_rate": 4.911226880894818e-06, + "loss": 0.0176, + "step": 22360 + }, + { + "epoch": 1.340403858829169, + "grad_norm": 0.5908241271972656, + "learning_rate": 4.9040505803544385e-06, + "loss": 0.0224, + "step": 22370 + }, + { + "epoch": 1.3410030559050872, + "grad_norm": 0.4231952428817749, + "learning_rate": 4.896881433750249e-06, + "loss": 0.0177, + "step": 22380 + }, + { + "epoch": 1.3416022529810054, + "grad_norm": 0.5666583180427551, + "learning_rate": 4.889719449495637e-06, + "loss": 0.0218, + "step": 22390 + }, + { + "epoch": 1.3422014500569237, + "grad_norm": 0.4740161597728729, + "learning_rate": 4.8825646359955926e-06, + "loss": 0.0179, + "step": 22400 + }, + { + "epoch": 1.342800647132842, + "grad_norm": 0.3947773873806, + "learning_rate": 4.8754170016466886e-06, + "loss": 0.02, + "step": 22410 + }, + { + "epoch": 1.3433998442087602, + "grad_norm": 0.3114109933376312, + "learning_rate": 4.868276554837072e-06, + "loss": 0.0223, + "step": 22420 + }, + { + "epoch": 1.3439990412846785, + "grad_norm": 0.44969403743743896, + "learning_rate": 4.861143303946457e-06, + "loss": 0.0169, + "step": 22430 + }, + { + "epoch": 1.3445982383605968, + "grad_norm": 0.29602059721946716, + "learning_rate": 4.854017257346105e-06, + "loss": 0.0168, + "step": 22440 + }, + { + "epoch": 1.345197435436515, + "grad_norm": 0.3884619474411011, + "learning_rate": 4.846898423398836e-06, + "loss": 0.0205, + "step": 22450 + }, + { + "epoch": 1.3457966325124333, + "grad_norm": 0.2929127514362335, + "learning_rate": 4.839786810458989e-06, + "loss": 0.0149, + "step": 22460 + }, + { + "epoch": 1.3463958295883516, + "grad_norm": 0.4955149292945862, + "learning_rate": 4.832682426872448e-06, + "loss": 0.0213, + "step": 22470 + }, + { + "epoch": 1.3469950266642698, + "grad_norm": 0.4021163582801819, + "learning_rate": 4.825585280976594e-06, + "loss": 0.0192, + "step": 22480 + }, + { + "epoch": 1.3475942237401881, + "grad_norm": 0.2945493757724762, + "learning_rate": 4.8184953811003274e-06, + "loss": 0.02, + "step": 22490 + }, + { + "epoch": 1.3481934208161064, + "grad_norm": 0.34085726737976074, + "learning_rate": 4.81141273556404e-06, + "loss": 0.0286, + "step": 22500 + }, + { + "epoch": 1.3487926178920246, + "grad_norm": 0.32751014828681946, + "learning_rate": 4.804337352679613e-06, + "loss": 0.0226, + "step": 22510 + }, + { + "epoch": 1.349391814967943, + "grad_norm": 0.3844929337501526, + "learning_rate": 4.7972692407503975e-06, + "loss": 0.0155, + "step": 22520 + }, + { + "epoch": 1.3499910120438612, + "grad_norm": 0.5286590456962585, + "learning_rate": 4.79020840807122e-06, + "loss": 0.0229, + "step": 22530 + }, + { + "epoch": 1.3505902091197795, + "grad_norm": 0.26664429903030396, + "learning_rate": 4.783154862928359e-06, + "loss": 0.0151, + "step": 22540 + }, + { + "epoch": 1.3511894061956977, + "grad_norm": 0.528367280960083, + "learning_rate": 4.776108613599547e-06, + "loss": 0.0239, + "step": 22550 + }, + { + "epoch": 1.351788603271616, + "grad_norm": 0.5871155858039856, + "learning_rate": 4.769069668353948e-06, + "loss": 0.0196, + "step": 22560 + }, + { + "epoch": 1.3523878003475343, + "grad_norm": 0.5686034560203552, + "learning_rate": 4.7620380354521524e-06, + "loss": 0.0184, + "step": 22570 + }, + { + "epoch": 1.3529869974234525, + "grad_norm": 0.40526366233825684, + "learning_rate": 4.755013723146175e-06, + "loss": 0.018, + "step": 22580 + }, + { + "epoch": 1.3535861944993708, + "grad_norm": 0.37055784463882446, + "learning_rate": 4.7479967396794376e-06, + "loss": 0.0184, + "step": 22590 + }, + { + "epoch": 1.354185391575289, + "grad_norm": 0.5210561156272888, + "learning_rate": 4.740987093286766e-06, + "loss": 0.0165, + "step": 22600 + }, + { + "epoch": 1.3547845886512073, + "grad_norm": 0.3386324942111969, + "learning_rate": 4.733984792194363e-06, + "loss": 0.018, + "step": 22610 + }, + { + "epoch": 1.3553837857271256, + "grad_norm": 0.40071168541908264, + "learning_rate": 4.726989844619823e-06, + "loss": 0.0198, + "step": 22620 + }, + { + "epoch": 1.3559829828030439, + "grad_norm": 0.3415983319282532, + "learning_rate": 4.720002258772107e-06, + "loss": 0.0168, + "step": 22630 + }, + { + "epoch": 1.3565821798789621, + "grad_norm": 0.3700709939002991, + "learning_rate": 4.713022042851537e-06, + "loss": 0.0166, + "step": 22640 + }, + { + "epoch": 1.3571813769548804, + "grad_norm": 0.3559338450431824, + "learning_rate": 4.706049205049784e-06, + "loss": 0.0174, + "step": 22650 + }, + { + "epoch": 1.3577805740307987, + "grad_norm": 0.5588265657424927, + "learning_rate": 4.699083753549858e-06, + "loss": 0.0207, + "step": 22660 + }, + { + "epoch": 1.358379771106717, + "grad_norm": 0.4539838433265686, + "learning_rate": 4.692125696526107e-06, + "loss": 0.0164, + "step": 22670 + }, + { + "epoch": 1.3589789681826352, + "grad_norm": 0.34879690408706665, + "learning_rate": 4.6851750421442e-06, + "loss": 0.0165, + "step": 22680 + }, + { + "epoch": 1.3595781652585535, + "grad_norm": 0.22862373292446136, + "learning_rate": 4.678231798561118e-06, + "loss": 0.0158, + "step": 22690 + }, + { + "epoch": 1.3601773623344717, + "grad_norm": 0.5536275506019592, + "learning_rate": 4.67129597392514e-06, + "loss": 0.0137, + "step": 22700 + }, + { + "epoch": 1.36077655941039, + "grad_norm": 0.5599532127380371, + "learning_rate": 4.664367576375844e-06, + "loss": 0.0206, + "step": 22710 + }, + { + "epoch": 1.3613757564863083, + "grad_norm": 0.2961312532424927, + "learning_rate": 4.65744661404409e-06, + "loss": 0.0138, + "step": 22720 + }, + { + "epoch": 1.3619749535622265, + "grad_norm": 0.5834526419639587, + "learning_rate": 4.650533095052018e-06, + "loss": 0.0174, + "step": 22730 + }, + { + "epoch": 1.362574150638145, + "grad_norm": 0.5941792726516724, + "learning_rate": 4.643627027513023e-06, + "loss": 0.0205, + "step": 22740 + }, + { + "epoch": 1.363173347714063, + "grad_norm": 0.2580801844596863, + "learning_rate": 4.636728419531758e-06, + "loss": 0.0199, + "step": 22750 + }, + { + "epoch": 1.3637725447899816, + "grad_norm": 0.3897567689418793, + "learning_rate": 4.629837279204125e-06, + "loss": 0.0168, + "step": 22760 + }, + { + "epoch": 1.3643717418658996, + "grad_norm": 0.37937042117118835, + "learning_rate": 4.6229536146172635e-06, + "loss": 0.0213, + "step": 22770 + }, + { + "epoch": 1.364970938941818, + "grad_norm": 0.3964179456233978, + "learning_rate": 4.616077433849538e-06, + "loss": 0.019, + "step": 22780 + }, + { + "epoch": 1.3655701360177361, + "grad_norm": 0.3632303476333618, + "learning_rate": 4.609208744970524e-06, + "loss": 0.015, + "step": 22790 + }, + { + "epoch": 1.3661693330936546, + "grad_norm": 0.5750122666358948, + "learning_rate": 4.602347556041014e-06, + "loss": 0.0168, + "step": 22800 + }, + { + "epoch": 1.3667685301695727, + "grad_norm": 0.36310067772865295, + "learning_rate": 4.595493875112996e-06, + "loss": 0.0172, + "step": 22810 + }, + { + "epoch": 1.3673677272454912, + "grad_norm": 0.5438339114189148, + "learning_rate": 4.5886477102296485e-06, + "loss": 0.0198, + "step": 22820 + }, + { + "epoch": 1.3679669243214092, + "grad_norm": 0.37394630908966064, + "learning_rate": 4.5818090694253246e-06, + "loss": 0.0202, + "step": 22830 + }, + { + "epoch": 1.3685661213973277, + "grad_norm": 0.2454962432384491, + "learning_rate": 4.574977960725548e-06, + "loss": 0.0188, + "step": 22840 + }, + { + "epoch": 1.3691653184732457, + "grad_norm": 0.474844366312027, + "learning_rate": 4.568154392147005e-06, + "loss": 0.0223, + "step": 22850 + }, + { + "epoch": 1.3697645155491642, + "grad_norm": 0.30256277322769165, + "learning_rate": 4.561338371697538e-06, + "loss": 0.0188, + "step": 22860 + }, + { + "epoch": 1.3703637126250823, + "grad_norm": 0.500045657157898, + "learning_rate": 4.554529907376127e-06, + "loss": 0.0179, + "step": 22870 + }, + { + "epoch": 1.3709629097010008, + "grad_norm": 0.609107494354248, + "learning_rate": 4.547729007172878e-06, + "loss": 0.0182, + "step": 22880 + }, + { + "epoch": 1.3715621067769188, + "grad_norm": 0.20867787301540375, + "learning_rate": 4.54093567906903e-06, + "loss": 0.0168, + "step": 22890 + }, + { + "epoch": 1.3721613038528373, + "grad_norm": 0.41653770208358765, + "learning_rate": 4.534149931036931e-06, + "loss": 0.0172, + "step": 22900 + }, + { + "epoch": 1.3727605009287553, + "grad_norm": 0.357435941696167, + "learning_rate": 4.527371771040039e-06, + "loss": 0.017, + "step": 22910 + }, + { + "epoch": 1.3733596980046738, + "grad_norm": 0.5994096994400024, + "learning_rate": 4.520601207032894e-06, + "loss": 0.0177, + "step": 22920 + }, + { + "epoch": 1.3739588950805919, + "grad_norm": 0.3150171935558319, + "learning_rate": 4.513838246961138e-06, + "loss": 0.0164, + "step": 22930 + }, + { + "epoch": 1.3745580921565104, + "grad_norm": 0.4483601748943329, + "learning_rate": 4.507082898761475e-06, + "loss": 0.019, + "step": 22940 + }, + { + "epoch": 1.3751572892324284, + "grad_norm": 0.529812753200531, + "learning_rate": 4.500335170361684e-06, + "loss": 0.0175, + "step": 22950 + }, + { + "epoch": 1.375756486308347, + "grad_norm": 0.26758334040641785, + "learning_rate": 4.493595069680604e-06, + "loss": 0.0187, + "step": 22960 + }, + { + "epoch": 1.376355683384265, + "grad_norm": 0.3228643834590912, + "learning_rate": 4.486862604628113e-06, + "loss": 0.0195, + "step": 22970 + }, + { + "epoch": 1.3769548804601834, + "grad_norm": 0.3437839150428772, + "learning_rate": 4.480137783105137e-06, + "loss": 0.0207, + "step": 22980 + }, + { + "epoch": 1.3775540775361017, + "grad_norm": 0.28592896461486816, + "learning_rate": 4.473420613003627e-06, + "loss": 0.0185, + "step": 22990 + }, + { + "epoch": 1.37815327461202, + "grad_norm": 0.5544041991233826, + "learning_rate": 4.46671110220656e-06, + "loss": 0.0191, + "step": 23000 + }, + { + "epoch": 1.3787524716879382, + "grad_norm": 1.0831762552261353, + "learning_rate": 4.460009258587915e-06, + "loss": 0.0237, + "step": 23010 + }, + { + "epoch": 1.3793516687638565, + "grad_norm": 0.3546636700630188, + "learning_rate": 4.453315090012686e-06, + "loss": 0.0203, + "step": 23020 + }, + { + "epoch": 1.3799508658397748, + "grad_norm": 0.32998642325401306, + "learning_rate": 4.446628604336844e-06, + "loss": 0.018, + "step": 23030 + }, + { + "epoch": 1.380550062915693, + "grad_norm": 0.40987834334373474, + "learning_rate": 4.439949809407357e-06, + "loss": 0.0189, + "step": 23040 + }, + { + "epoch": 1.3811492599916113, + "grad_norm": 0.6094655990600586, + "learning_rate": 4.433278713062166e-06, + "loss": 0.0174, + "step": 23050 + }, + { + "epoch": 1.3817484570675296, + "grad_norm": 0.631481409072876, + "learning_rate": 4.426615323130167e-06, + "loss": 0.0179, + "step": 23060 + }, + { + "epoch": 1.3823476541434478, + "grad_norm": 0.4069002866744995, + "learning_rate": 4.4199596474312235e-06, + "loss": 0.0192, + "step": 23070 + }, + { + "epoch": 1.3829468512193661, + "grad_norm": 0.36600202322006226, + "learning_rate": 4.413311693776142e-06, + "loss": 0.0196, + "step": 23080 + }, + { + "epoch": 1.3835460482952844, + "grad_norm": 0.3092246353626251, + "learning_rate": 4.406671469966666e-06, + "loss": 0.0185, + "step": 23090 + }, + { + "epoch": 1.3841452453712026, + "grad_norm": 0.2811580300331116, + "learning_rate": 4.400038983795467e-06, + "loss": 0.0162, + "step": 23100 + }, + { + "epoch": 1.384744442447121, + "grad_norm": 0.4177345037460327, + "learning_rate": 4.393414243046139e-06, + "loss": 0.0196, + "step": 23110 + }, + { + "epoch": 1.3853436395230392, + "grad_norm": 0.40211164951324463, + "learning_rate": 4.386797255493181e-06, + "loss": 0.0199, + "step": 23120 + }, + { + "epoch": 1.3859428365989575, + "grad_norm": 0.31014713644981384, + "learning_rate": 4.380188028901997e-06, + "loss": 0.0173, + "step": 23130 + }, + { + "epoch": 1.3865420336748757, + "grad_norm": 0.5378808379173279, + "learning_rate": 4.373586571028887e-06, + "loss": 0.0189, + "step": 23140 + }, + { + "epoch": 1.387141230750794, + "grad_norm": 0.3483606278896332, + "learning_rate": 4.366992889621023e-06, + "loss": 0.0201, + "step": 23150 + }, + { + "epoch": 1.3877404278267123, + "grad_norm": 0.5112893581390381, + "learning_rate": 4.360406992416461e-06, + "loss": 0.0209, + "step": 23160 + }, + { + "epoch": 1.3883396249026305, + "grad_norm": 0.26471400260925293, + "learning_rate": 4.3538288871441184e-06, + "loss": 0.013, + "step": 23170 + }, + { + "epoch": 1.3889388219785488, + "grad_norm": 0.6770564317703247, + "learning_rate": 4.34725858152377e-06, + "loss": 0.0174, + "step": 23180 + }, + { + "epoch": 1.389538019054467, + "grad_norm": 0.4251134693622589, + "learning_rate": 4.34069608326603e-06, + "loss": 0.0169, + "step": 23190 + }, + { + "epoch": 1.3901372161303853, + "grad_norm": 0.2985415458679199, + "learning_rate": 4.334141400072358e-06, + "loss": 0.0212, + "step": 23200 + }, + { + "epoch": 1.3907364132063036, + "grad_norm": 0.4635870158672333, + "learning_rate": 4.327594539635044e-06, + "loss": 0.0164, + "step": 23210 + }, + { + "epoch": 1.3913356102822219, + "grad_norm": 0.4360525906085968, + "learning_rate": 4.321055509637188e-06, + "loss": 0.0174, + "step": 23220 + }, + { + "epoch": 1.3919348073581401, + "grad_norm": 0.6121042370796204, + "learning_rate": 4.314524317752708e-06, + "loss": 0.0176, + "step": 23230 + }, + { + "epoch": 1.3925340044340584, + "grad_norm": 0.3049333095550537, + "learning_rate": 4.308000971646321e-06, + "loss": 0.0162, + "step": 23240 + }, + { + "epoch": 1.3931332015099767, + "grad_norm": 0.46471482515335083, + "learning_rate": 4.301485478973536e-06, + "loss": 0.0193, + "step": 23250 + }, + { + "epoch": 1.393732398585895, + "grad_norm": 0.27093327045440674, + "learning_rate": 4.294977847380648e-06, + "loss": 0.0204, + "step": 23260 + }, + { + "epoch": 1.3943315956618132, + "grad_norm": 0.3513331711292267, + "learning_rate": 4.288478084504728e-06, + "loss": 0.0209, + "step": 23270 + }, + { + "epoch": 1.3949307927377315, + "grad_norm": 0.3452320396900177, + "learning_rate": 4.281986197973603e-06, + "loss": 0.0172, + "step": 23280 + }, + { + "epoch": 1.3955299898136497, + "grad_norm": 0.44609951972961426, + "learning_rate": 4.275502195405868e-06, + "loss": 0.0198, + "step": 23290 + }, + { + "epoch": 1.396129186889568, + "grad_norm": 0.27217286825180054, + "learning_rate": 4.269026084410863e-06, + "loss": 0.016, + "step": 23300 + }, + { + "epoch": 1.3967283839654863, + "grad_norm": 0.5857428908348083, + "learning_rate": 4.262557872588659e-06, + "loss": 0.0206, + "step": 23310 + }, + { + "epoch": 1.3973275810414045, + "grad_norm": 0.3834620714187622, + "learning_rate": 4.25609756753007e-06, + "loss": 0.0165, + "step": 23320 + }, + { + "epoch": 1.3979267781173228, + "grad_norm": 0.34176892042160034, + "learning_rate": 4.249645176816617e-06, + "loss": 0.0156, + "step": 23330 + }, + { + "epoch": 1.398525975193241, + "grad_norm": 0.2497260719537735, + "learning_rate": 4.243200708020542e-06, + "loss": 0.0183, + "step": 23340 + }, + { + "epoch": 1.3991251722691593, + "grad_norm": 0.3003418743610382, + "learning_rate": 4.236764168704789e-06, + "loss": 0.0188, + "step": 23350 + }, + { + "epoch": 1.3997243693450776, + "grad_norm": 0.19922316074371338, + "learning_rate": 4.230335566422999e-06, + "loss": 0.0162, + "step": 23360 + }, + { + "epoch": 1.4003235664209959, + "grad_norm": 0.5160003900527954, + "learning_rate": 4.223914908719488e-06, + "loss": 0.0181, + "step": 23370 + }, + { + "epoch": 1.4009227634969141, + "grad_norm": 0.4917953312397003, + "learning_rate": 4.217502203129258e-06, + "loss": 0.0197, + "step": 23380 + }, + { + "epoch": 1.4015219605728324, + "grad_norm": 0.2868032455444336, + "learning_rate": 4.211097457177978e-06, + "loss": 0.0192, + "step": 23390 + }, + { + "epoch": 1.4021211576487507, + "grad_norm": 0.30980560183525085, + "learning_rate": 4.204700678381975e-06, + "loss": 0.0178, + "step": 23400 + }, + { + "epoch": 1.402720354724669, + "grad_norm": 0.31523144245147705, + "learning_rate": 4.198311874248223e-06, + "loss": 0.0193, + "step": 23410 + }, + { + "epoch": 1.4033195518005872, + "grad_norm": 0.23731909692287445, + "learning_rate": 4.191931052274337e-06, + "loss": 0.0171, + "step": 23420 + }, + { + "epoch": 1.4039187488765055, + "grad_norm": 0.4911767542362213, + "learning_rate": 4.185558219948571e-06, + "loss": 0.0171, + "step": 23430 + }, + { + "epoch": 1.4045179459524237, + "grad_norm": 0.3095512390136719, + "learning_rate": 4.1791933847497956e-06, + "loss": 0.0165, + "step": 23440 + }, + { + "epoch": 1.405117143028342, + "grad_norm": 0.6421821117401123, + "learning_rate": 4.172836554147505e-06, + "loss": 0.0178, + "step": 23450 + }, + { + "epoch": 1.4057163401042603, + "grad_norm": 0.4887765645980835, + "learning_rate": 4.166487735601787e-06, + "loss": 0.0212, + "step": 23460 + }, + { + "epoch": 1.4063155371801785, + "grad_norm": 0.4543951451778412, + "learning_rate": 4.160146936563338e-06, + "loss": 0.0165, + "step": 23470 + }, + { + "epoch": 1.4069147342560968, + "grad_norm": 0.4595223367214203, + "learning_rate": 4.153814164473437e-06, + "loss": 0.0144, + "step": 23480 + }, + { + "epoch": 1.407513931332015, + "grad_norm": 0.6325511336326599, + "learning_rate": 4.1474894267639476e-06, + "loss": 0.0203, + "step": 23490 + }, + { + "epoch": 1.4081131284079333, + "grad_norm": 0.6220779418945312, + "learning_rate": 4.141172730857301e-06, + "loss": 0.0225, + "step": 23500 + }, + { + "epoch": 1.4087123254838516, + "grad_norm": 0.3728989362716675, + "learning_rate": 4.1348640841664874e-06, + "loss": 0.0202, + "step": 23510 + }, + { + "epoch": 1.4093115225597699, + "grad_norm": 0.4958861470222473, + "learning_rate": 4.128563494095059e-06, + "loss": 0.0204, + "step": 23520 + }, + { + "epoch": 1.4099107196356881, + "grad_norm": 0.32445529103279114, + "learning_rate": 4.122270968037107e-06, + "loss": 0.016, + "step": 23530 + }, + { + "epoch": 1.4105099167116064, + "grad_norm": 0.3969140350818634, + "learning_rate": 4.115986513377266e-06, + "loss": 0.0174, + "step": 23540 + }, + { + "epoch": 1.4111091137875247, + "grad_norm": 0.39698946475982666, + "learning_rate": 4.109710137490687e-06, + "loss": 0.0163, + "step": 23550 + }, + { + "epoch": 1.411708310863443, + "grad_norm": 0.4633882939815521, + "learning_rate": 4.103441847743051e-06, + "loss": 0.0179, + "step": 23560 + }, + { + "epoch": 1.4123075079393612, + "grad_norm": 0.36993899941444397, + "learning_rate": 4.097181651490543e-06, + "loss": 0.0216, + "step": 23570 + }, + { + "epoch": 1.4129067050152795, + "grad_norm": 0.4137882590293884, + "learning_rate": 4.090929556079854e-06, + "loss": 0.0187, + "step": 23580 + }, + { + "epoch": 1.4135059020911978, + "grad_norm": 0.320867121219635, + "learning_rate": 4.084685568848168e-06, + "loss": 0.0238, + "step": 23590 + }, + { + "epoch": 1.414105099167116, + "grad_norm": 0.3139745593070984, + "learning_rate": 4.078449697123146e-06, + "loss": 0.0175, + "step": 23600 + }, + { + "epoch": 1.4147042962430343, + "grad_norm": 0.572628378868103, + "learning_rate": 4.072221948222934e-06, + "loss": 0.018, + "step": 23610 + }, + { + "epoch": 1.4153034933189526, + "grad_norm": 0.575975239276886, + "learning_rate": 4.066002329456142e-06, + "loss": 0.0189, + "step": 23620 + }, + { + "epoch": 1.4159026903948708, + "grad_norm": 0.26301854848861694, + "learning_rate": 4.05979084812184e-06, + "loss": 0.0121, + "step": 23630 + }, + { + "epoch": 1.416501887470789, + "grad_norm": 0.3042408525943756, + "learning_rate": 4.053587511509546e-06, + "loss": 0.0185, + "step": 23640 + }, + { + "epoch": 1.4171010845467074, + "grad_norm": 0.2503415644168854, + "learning_rate": 4.04739232689922e-06, + "loss": 0.0208, + "step": 23650 + }, + { + "epoch": 1.4177002816226256, + "grad_norm": 0.3556166887283325, + "learning_rate": 4.0412053015612584e-06, + "loss": 0.0202, + "step": 23660 + }, + { + "epoch": 1.418299478698544, + "grad_norm": 0.652975857257843, + "learning_rate": 4.035026442756482e-06, + "loss": 0.0194, + "step": 23670 + }, + { + "epoch": 1.4188986757744622, + "grad_norm": 0.4215060770511627, + "learning_rate": 4.028855757736123e-06, + "loss": 0.0166, + "step": 23680 + }, + { + "epoch": 1.4194978728503804, + "grad_norm": 0.2277296483516693, + "learning_rate": 4.022693253741821e-06, + "loss": 0.0172, + "step": 23690 + }, + { + "epoch": 1.4200970699262987, + "grad_norm": 0.3370293378829956, + "learning_rate": 4.016538938005621e-06, + "loss": 0.0201, + "step": 23700 + }, + { + "epoch": 1.420696267002217, + "grad_norm": 0.4235946834087372, + "learning_rate": 4.0103928177499565e-06, + "loss": 0.0189, + "step": 23710 + }, + { + "epoch": 1.4212954640781352, + "grad_norm": 1.0387974977493286, + "learning_rate": 4.004254900187642e-06, + "loss": 0.0176, + "step": 23720 + }, + { + "epoch": 1.4218946611540535, + "grad_norm": 0.7258256077766418, + "learning_rate": 3.998125192521861e-06, + "loss": 0.0204, + "step": 23730 + }, + { + "epoch": 1.4224938582299718, + "grad_norm": 0.35412806272506714, + "learning_rate": 3.992003701946171e-06, + "loss": 0.0165, + "step": 23740 + }, + { + "epoch": 1.42309305530589, + "grad_norm": 0.5192556977272034, + "learning_rate": 3.9858904356444815e-06, + "loss": 0.0166, + "step": 23750 + }, + { + "epoch": 1.4236922523818083, + "grad_norm": 0.3292843699455261, + "learning_rate": 3.979785400791052e-06, + "loss": 0.0163, + "step": 23760 + }, + { + "epoch": 1.4242914494577266, + "grad_norm": 0.46782153844833374, + "learning_rate": 3.973688604550481e-06, + "loss": 0.0174, + "step": 23770 + }, + { + "epoch": 1.4248906465336448, + "grad_norm": 0.6324945092201233, + "learning_rate": 3.9676000540776944e-06, + "loss": 0.0183, + "step": 23780 + }, + { + "epoch": 1.4254898436095633, + "grad_norm": 0.4347882568836212, + "learning_rate": 3.961519756517947e-06, + "loss": 0.0138, + "step": 23790 + }, + { + "epoch": 1.4260890406854814, + "grad_norm": 0.3393082320690155, + "learning_rate": 3.955447719006809e-06, + "loss": 0.0155, + "step": 23800 + }, + { + "epoch": 1.4266882377613999, + "grad_norm": 0.28411221504211426, + "learning_rate": 3.949383948670156e-06, + "loss": 0.016, + "step": 23810 + }, + { + "epoch": 1.427287434837318, + "grad_norm": 0.45982369780540466, + "learning_rate": 3.9433284526241535e-06, + "loss": 0.0134, + "step": 23820 + }, + { + "epoch": 1.4278866319132364, + "grad_norm": 0.32810381054878235, + "learning_rate": 3.937281237975269e-06, + "loss": 0.0163, + "step": 23830 + }, + { + "epoch": 1.4284858289891544, + "grad_norm": 0.5996097922325134, + "learning_rate": 3.9312423118202426e-06, + "loss": 0.0246, + "step": 23840 + }, + { + "epoch": 1.429085026065073, + "grad_norm": 0.40002167224884033, + "learning_rate": 3.925211681246096e-06, + "loss": 0.0158, + "step": 23850 + }, + { + "epoch": 1.429684223140991, + "grad_norm": 0.4102090299129486, + "learning_rate": 3.919189353330104e-06, + "loss": 0.0179, + "step": 23860 + }, + { + "epoch": 1.4302834202169095, + "grad_norm": 0.44915929436683655, + "learning_rate": 3.913175335139808e-06, + "loss": 0.019, + "step": 23870 + }, + { + "epoch": 1.4308826172928275, + "grad_norm": 0.251206636428833, + "learning_rate": 3.907169633732989e-06, + "loss": 0.0183, + "step": 23880 + }, + { + "epoch": 1.431481814368746, + "grad_norm": 0.2564012408256531, + "learning_rate": 3.901172256157674e-06, + "loss": 0.0182, + "step": 23890 + }, + { + "epoch": 1.432081011444664, + "grad_norm": 0.431265652179718, + "learning_rate": 3.895183209452123e-06, + "loss": 0.0177, + "step": 23900 + }, + { + "epoch": 1.4326802085205825, + "grad_norm": 0.42389997839927673, + "learning_rate": 3.889202500644811e-06, + "loss": 0.0146, + "step": 23910 + }, + { + "epoch": 1.4332794055965006, + "grad_norm": 0.9380725622177124, + "learning_rate": 3.883230136754435e-06, + "loss": 0.0206, + "step": 23920 + }, + { + "epoch": 1.433878602672419, + "grad_norm": 0.3655669093132019, + "learning_rate": 3.877266124789896e-06, + "loss": 0.0151, + "step": 23930 + }, + { + "epoch": 1.4344777997483371, + "grad_norm": 0.3248157501220703, + "learning_rate": 3.871310471750298e-06, + "loss": 0.0152, + "step": 23940 + }, + { + "epoch": 1.4350769968242556, + "grad_norm": 0.5733596086502075, + "learning_rate": 3.865363184624925e-06, + "loss": 0.0175, + "step": 23950 + }, + { + "epoch": 1.4356761939001736, + "grad_norm": 0.4672720730304718, + "learning_rate": 3.859424270393256e-06, + "loss": 0.0185, + "step": 23960 + }, + { + "epoch": 1.4362753909760921, + "grad_norm": 0.22989575564861298, + "learning_rate": 3.853493736024934e-06, + "loss": 0.0165, + "step": 23970 + }, + { + "epoch": 1.4368745880520102, + "grad_norm": 1.0956321954727173, + "learning_rate": 3.84757158847977e-06, + "loss": 0.0181, + "step": 23980 + }, + { + "epoch": 1.4374737851279287, + "grad_norm": 0.39079031348228455, + "learning_rate": 3.841657834707739e-06, + "loss": 0.0212, + "step": 23990 + }, + { + "epoch": 1.4380729822038467, + "grad_norm": 0.3974068760871887, + "learning_rate": 3.8357524816489565e-06, + "loss": 0.0197, + "step": 24000 + }, + { + "epoch": 1.4386721792797652, + "grad_norm": 1.1926871538162231, + "learning_rate": 3.829855536233683e-06, + "loss": 0.0185, + "step": 24010 + }, + { + "epoch": 1.4392713763556833, + "grad_norm": 0.40923064947128296, + "learning_rate": 3.823967005382315e-06, + "loss": 0.0171, + "step": 24020 + }, + { + "epoch": 1.4398705734316017, + "grad_norm": 0.38384920358657837, + "learning_rate": 3.8180868960053725e-06, + "loss": 0.0161, + "step": 24030 + }, + { + "epoch": 1.4404697705075198, + "grad_norm": 0.21791735291481018, + "learning_rate": 3.8122152150034863e-06, + "loss": 0.0168, + "step": 24040 + }, + { + "epoch": 1.4410689675834383, + "grad_norm": 0.3207184672355652, + "learning_rate": 3.806351969267404e-06, + "loss": 0.0147, + "step": 24050 + }, + { + "epoch": 1.4416681646593565, + "grad_norm": 0.4831724166870117, + "learning_rate": 3.8004971656779733e-06, + "loss": 0.0182, + "step": 24060 + }, + { + "epoch": 1.4422673617352748, + "grad_norm": 0.47996360063552856, + "learning_rate": 3.794650811106129e-06, + "loss": 0.0175, + "step": 24070 + }, + { + "epoch": 1.442866558811193, + "grad_norm": 0.41330286860466003, + "learning_rate": 3.7888129124128946e-06, + "loss": 0.0172, + "step": 24080 + }, + { + "epoch": 1.4434657558871113, + "grad_norm": 0.5012956857681274, + "learning_rate": 3.782983476449369e-06, + "loss": 0.0234, + "step": 24090 + }, + { + "epoch": 1.4440649529630296, + "grad_norm": 0.4715912640094757, + "learning_rate": 3.777162510056721e-06, + "loss": 0.016, + "step": 24100 + }, + { + "epoch": 1.4446641500389479, + "grad_norm": 0.3817141652107239, + "learning_rate": 3.771350020066177e-06, + "loss": 0.0158, + "step": 24110 + }, + { + "epoch": 1.4452633471148661, + "grad_norm": 0.3964484930038452, + "learning_rate": 3.765546013299023e-06, + "loss": 0.0216, + "step": 24120 + }, + { + "epoch": 1.4458625441907844, + "grad_norm": 0.29786166548728943, + "learning_rate": 3.759750496566577e-06, + "loss": 0.0197, + "step": 24130 + }, + { + "epoch": 1.4464617412667027, + "grad_norm": 0.2796359360218048, + "learning_rate": 3.7539634766702047e-06, + "loss": 0.018, + "step": 24140 + }, + { + "epoch": 1.447060938342621, + "grad_norm": 0.30957916378974915, + "learning_rate": 3.748184960401299e-06, + "loss": 0.0154, + "step": 24150 + }, + { + "epoch": 1.4476601354185392, + "grad_norm": 0.3837800920009613, + "learning_rate": 3.742414954541267e-06, + "loss": 0.0171, + "step": 24160 + }, + { + "epoch": 1.4482593324944575, + "grad_norm": 0.29726749658584595, + "learning_rate": 3.7366534658615293e-06, + "loss": 0.0167, + "step": 24170 + }, + { + "epoch": 1.4488585295703758, + "grad_norm": 0.4624067544937134, + "learning_rate": 3.730900501123518e-06, + "loss": 0.0187, + "step": 24180 + }, + { + "epoch": 1.449457726646294, + "grad_norm": 0.46996721625328064, + "learning_rate": 3.7251560670786545e-06, + "loss": 0.0156, + "step": 24190 + }, + { + "epoch": 1.4500569237222123, + "grad_norm": 0.351532518863678, + "learning_rate": 3.7194201704683563e-06, + "loss": 0.0221, + "step": 24200 + }, + { + "epoch": 1.4506561207981306, + "grad_norm": 0.5119938254356384, + "learning_rate": 3.713692818024016e-06, + "loss": 0.0194, + "step": 24210 + }, + { + "epoch": 1.4512553178740488, + "grad_norm": 0.5102914571762085, + "learning_rate": 3.707974016467e-06, + "loss": 0.0152, + "step": 24220 + }, + { + "epoch": 1.451854514949967, + "grad_norm": 0.4638414680957794, + "learning_rate": 3.7022637725086386e-06, + "loss": 0.0167, + "step": 24230 + }, + { + "epoch": 1.4524537120258854, + "grad_norm": 0.6181433200836182, + "learning_rate": 3.696562092850226e-06, + "loss": 0.016, + "step": 24240 + }, + { + "epoch": 1.4530529091018036, + "grad_norm": 0.31810933351516724, + "learning_rate": 3.690868984182998e-06, + "loss": 0.0175, + "step": 24250 + }, + { + "epoch": 1.453652106177722, + "grad_norm": 0.20725348591804504, + "learning_rate": 3.6851844531881325e-06, + "loss": 0.0139, + "step": 24260 + }, + { + "epoch": 1.4542513032536402, + "grad_norm": 0.29788675904273987, + "learning_rate": 3.679508506536745e-06, + "loss": 0.0153, + "step": 24270 + }, + { + "epoch": 1.4548505003295584, + "grad_norm": 0.286422997713089, + "learning_rate": 3.6738411508898753e-06, + "loss": 0.0171, + "step": 24280 + }, + { + "epoch": 1.4554496974054767, + "grad_norm": 0.31199127435684204, + "learning_rate": 3.668182392898484e-06, + "loss": 0.0183, + "step": 24290 + }, + { + "epoch": 1.456048894481395, + "grad_norm": 0.5850293040275574, + "learning_rate": 3.6625322392034334e-06, + "loss": 0.0199, + "step": 24300 + }, + { + "epoch": 1.4566480915573132, + "grad_norm": 0.5558650493621826, + "learning_rate": 3.6568906964354963e-06, + "loss": 0.0153, + "step": 24310 + }, + { + "epoch": 1.4572472886332315, + "grad_norm": 0.5221429467201233, + "learning_rate": 3.6512577712153373e-06, + "loss": 0.0159, + "step": 24320 + }, + { + "epoch": 1.4578464857091498, + "grad_norm": 0.40443119406700134, + "learning_rate": 3.6456334701535094e-06, + "loss": 0.0176, + "step": 24330 + }, + { + "epoch": 1.458445682785068, + "grad_norm": 0.4657982289791107, + "learning_rate": 3.6400177998504416e-06, + "loss": 0.0183, + "step": 24340 + }, + { + "epoch": 1.4590448798609863, + "grad_norm": 0.23784635961055756, + "learning_rate": 3.6344107668964334e-06, + "loss": 0.0156, + "step": 24350 + }, + { + "epoch": 1.4596440769369046, + "grad_norm": 0.3992721438407898, + "learning_rate": 3.6288123778716534e-06, + "loss": 0.0223, + "step": 24360 + }, + { + "epoch": 1.4602432740128228, + "grad_norm": 0.3949171304702759, + "learning_rate": 3.6232226393461205e-06, + "loss": 0.014, + "step": 24370 + }, + { + "epoch": 1.460842471088741, + "grad_norm": 0.33738628029823303, + "learning_rate": 3.6176415578797095e-06, + "loss": 0.0157, + "step": 24380 + }, + { + "epoch": 1.4614416681646594, + "grad_norm": 0.42644673585891724, + "learning_rate": 3.612069140022124e-06, + "loss": 0.0153, + "step": 24390 + }, + { + "epoch": 1.4620408652405776, + "grad_norm": 0.25812193751335144, + "learning_rate": 3.6065053923129094e-06, + "loss": 0.0173, + "step": 24400 + }, + { + "epoch": 1.462640062316496, + "grad_norm": 0.29154765605926514, + "learning_rate": 3.6009503212814335e-06, + "loss": 0.0162, + "step": 24410 + }, + { + "epoch": 1.4632392593924142, + "grad_norm": 0.3526030480861664, + "learning_rate": 3.595403933446885e-06, + "loss": 0.0182, + "step": 24420 + }, + { + "epoch": 1.4638384564683324, + "grad_norm": 0.731890857219696, + "learning_rate": 3.589866235318254e-06, + "loss": 0.0166, + "step": 24430 + }, + { + "epoch": 1.4644376535442507, + "grad_norm": 0.34727898240089417, + "learning_rate": 3.584337233394337e-06, + "loss": 0.0146, + "step": 24440 + }, + { + "epoch": 1.465036850620169, + "grad_norm": 0.4517475962638855, + "learning_rate": 3.5788169341637304e-06, + "loss": 0.0229, + "step": 24450 + }, + { + "epoch": 1.4656360476960872, + "grad_norm": 0.3026634156703949, + "learning_rate": 3.573305344104808e-06, + "loss": 0.0174, + "step": 24460 + }, + { + "epoch": 1.4662352447720055, + "grad_norm": 0.20546412467956543, + "learning_rate": 3.5678024696857336e-06, + "loss": 0.0177, + "step": 24470 + }, + { + "epoch": 1.4668344418479238, + "grad_norm": 0.47296327352523804, + "learning_rate": 3.5623083173644306e-06, + "loss": 0.0176, + "step": 24480 + }, + { + "epoch": 1.467433638923842, + "grad_norm": 0.4550913870334625, + "learning_rate": 3.5568228935885963e-06, + "loss": 0.0167, + "step": 24490 + }, + { + "epoch": 1.4680328359997603, + "grad_norm": 0.38641592860221863, + "learning_rate": 3.5513462047956804e-06, + "loss": 0.0144, + "step": 24500 + }, + { + "epoch": 1.4686320330756786, + "grad_norm": 0.23746857047080994, + "learning_rate": 3.5458782574128837e-06, + "loss": 0.0167, + "step": 24510 + }, + { + "epoch": 1.4692312301515968, + "grad_norm": 0.2114812433719635, + "learning_rate": 3.5404190578571436e-06, + "loss": 0.0167, + "step": 24520 + }, + { + "epoch": 1.4698304272275151, + "grad_norm": 0.41703343391418457, + "learning_rate": 3.5349686125351386e-06, + "loss": 0.014, + "step": 24530 + }, + { + "epoch": 1.4704296243034334, + "grad_norm": 0.3279412090778351, + "learning_rate": 3.5295269278432664e-06, + "loss": 0.0158, + "step": 24540 + }, + { + "epoch": 1.4710288213793516, + "grad_norm": 0.41653862595558167, + "learning_rate": 3.524094010167648e-06, + "loss": 0.0191, + "step": 24550 + }, + { + "epoch": 1.47162801845527, + "grad_norm": 0.5392111539840698, + "learning_rate": 3.518669865884119e-06, + "loss": 0.0151, + "step": 24560 + }, + { + "epoch": 1.4722272155311882, + "grad_norm": 0.4654570519924164, + "learning_rate": 3.513254501358208e-06, + "loss": 0.0179, + "step": 24570 + }, + { + "epoch": 1.4728264126071064, + "grad_norm": 0.5389031171798706, + "learning_rate": 3.5078479229451512e-06, + "loss": 0.0139, + "step": 24580 + }, + { + "epoch": 1.4734256096830247, + "grad_norm": 0.38597020506858826, + "learning_rate": 3.502450136989869e-06, + "loss": 0.0142, + "step": 24590 + }, + { + "epoch": 1.474024806758943, + "grad_norm": 0.4820668399333954, + "learning_rate": 3.497061149826966e-06, + "loss": 0.015, + "step": 24600 + }, + { + "epoch": 1.4746240038348613, + "grad_norm": 0.36856982111930847, + "learning_rate": 3.4916809677807116e-06, + "loss": 0.0141, + "step": 24610 + }, + { + "epoch": 1.4752232009107795, + "grad_norm": 0.39727091789245605, + "learning_rate": 3.486309597165059e-06, + "loss": 0.0173, + "step": 24620 + }, + { + "epoch": 1.4758223979866978, + "grad_norm": 0.29800575971603394, + "learning_rate": 3.4809470442836023e-06, + "loss": 0.017, + "step": 24630 + }, + { + "epoch": 1.476421595062616, + "grad_norm": 0.6900123357772827, + "learning_rate": 3.4755933154295997e-06, + "loss": 0.0172, + "step": 24640 + }, + { + "epoch": 1.4770207921385343, + "grad_norm": 0.2665303647518158, + "learning_rate": 3.4702484168859517e-06, + "loss": 0.0157, + "step": 24650 + }, + { + "epoch": 1.4776199892144526, + "grad_norm": 0.3223106265068054, + "learning_rate": 3.4649123549251906e-06, + "loss": 0.016, + "step": 24660 + }, + { + "epoch": 1.4782191862903709, + "grad_norm": 0.3684261739253998, + "learning_rate": 3.4595851358094847e-06, + "loss": 0.02, + "step": 24670 + }, + { + "epoch": 1.4788183833662891, + "grad_norm": 0.38197198510169983, + "learning_rate": 3.454266765790622e-06, + "loss": 0.0169, + "step": 24680 + }, + { + "epoch": 1.4794175804422074, + "grad_norm": 0.35841095447540283, + "learning_rate": 3.448957251110008e-06, + "loss": 0.0175, + "step": 24690 + }, + { + "epoch": 1.4800167775181257, + "grad_norm": 0.4376572370529175, + "learning_rate": 3.443656597998649e-06, + "loss": 0.0156, + "step": 24700 + }, + { + "epoch": 1.480615974594044, + "grad_norm": 0.5526829361915588, + "learning_rate": 3.438364812677163e-06, + "loss": 0.0147, + "step": 24710 + }, + { + "epoch": 1.4812151716699622, + "grad_norm": 0.2922399938106537, + "learning_rate": 3.433081901355748e-06, + "loss": 0.0152, + "step": 24720 + }, + { + "epoch": 1.4818143687458805, + "grad_norm": 0.4333120882511139, + "learning_rate": 3.4278078702341987e-06, + "loss": 0.0147, + "step": 24730 + }, + { + "epoch": 1.4824135658217987, + "grad_norm": 0.26118189096450806, + "learning_rate": 3.422542725501885e-06, + "loss": 0.0166, + "step": 24740 + }, + { + "epoch": 1.483012762897717, + "grad_norm": 0.35313257575035095, + "learning_rate": 3.4172864733377453e-06, + "loss": 0.016, + "step": 24750 + }, + { + "epoch": 1.4836119599736353, + "grad_norm": 0.29923367500305176, + "learning_rate": 3.4120391199102854e-06, + "loss": 0.0201, + "step": 24760 + }, + { + "epoch": 1.4842111570495535, + "grad_norm": 0.434772253036499, + "learning_rate": 3.4068006713775668e-06, + "loss": 0.0193, + "step": 24770 + }, + { + "epoch": 1.4848103541254718, + "grad_norm": 0.3422386646270752, + "learning_rate": 3.4015711338872013e-06, + "loss": 0.0148, + "step": 24780 + }, + { + "epoch": 1.48540955120139, + "grad_norm": 0.4303880035877228, + "learning_rate": 3.396350513576341e-06, + "loss": 0.0197, + "step": 24790 + }, + { + "epoch": 1.4860087482773083, + "grad_norm": 0.4511156976222992, + "learning_rate": 3.391138816571675e-06, + "loss": 0.0172, + "step": 24800 + }, + { + "epoch": 1.4866079453532266, + "grad_norm": 0.22014041244983673, + "learning_rate": 3.3859360489894217e-06, + "loss": 0.0146, + "step": 24810 + }, + { + "epoch": 1.4872071424291449, + "grad_norm": 0.4387083351612091, + "learning_rate": 3.3807422169353153e-06, + "loss": 0.0173, + "step": 24820 + }, + { + "epoch": 1.4878063395050631, + "grad_norm": 0.44642165303230286, + "learning_rate": 3.375557326504612e-06, + "loss": 0.0175, + "step": 24830 + }, + { + "epoch": 1.4884055365809814, + "grad_norm": 0.39087313413619995, + "learning_rate": 3.3703813837820633e-06, + "loss": 0.0158, + "step": 24840 + }, + { + "epoch": 1.4890047336568997, + "grad_norm": 0.42447686195373535, + "learning_rate": 3.36521439484193e-06, + "loss": 0.0133, + "step": 24850 + }, + { + "epoch": 1.4896039307328182, + "grad_norm": 0.43447887897491455, + "learning_rate": 3.36005636574796e-06, + "loss": 0.017, + "step": 24860 + }, + { + "epoch": 1.4902031278087362, + "grad_norm": 0.3336028754711151, + "learning_rate": 3.354907302553392e-06, + "loss": 0.0159, + "step": 24870 + }, + { + "epoch": 1.4908023248846547, + "grad_norm": 0.3250858187675476, + "learning_rate": 3.349767211300933e-06, + "loss": 0.0169, + "step": 24880 + }, + { + "epoch": 1.4914015219605727, + "grad_norm": 0.2616746425628662, + "learning_rate": 3.3446360980227682e-06, + "loss": 0.0138, + "step": 24890 + }, + { + "epoch": 1.4920007190364912, + "grad_norm": 0.2752698063850403, + "learning_rate": 3.3395139687405463e-06, + "loss": 0.0144, + "step": 24900 + }, + { + "epoch": 1.4925999161124093, + "grad_norm": 0.28214627504348755, + "learning_rate": 3.3344008294653685e-06, + "loss": 0.0157, + "step": 24910 + }, + { + "epoch": 1.4931991131883278, + "grad_norm": 0.3839667737483978, + "learning_rate": 3.3292966861977933e-06, + "loss": 0.0153, + "step": 24920 + }, + { + "epoch": 1.4937983102642458, + "grad_norm": 0.29319512844085693, + "learning_rate": 3.324201544927811e-06, + "loss": 0.0165, + "step": 24930 + }, + { + "epoch": 1.4943975073401643, + "grad_norm": 0.4219116270542145, + "learning_rate": 3.319115411634857e-06, + "loss": 0.0173, + "step": 24940 + }, + { + "epoch": 1.4949967044160823, + "grad_norm": 0.4940520226955414, + "learning_rate": 3.3140382922877912e-06, + "loss": 0.0163, + "step": 24950 + }, + { + "epoch": 1.4955959014920008, + "grad_norm": 0.40064749121665955, + "learning_rate": 3.3089701928448987e-06, + "loss": 0.0146, + "step": 24960 + }, + { + "epoch": 1.4961950985679189, + "grad_norm": 0.33400869369506836, + "learning_rate": 3.303911119253872e-06, + "loss": 0.0165, + "step": 24970 + }, + { + "epoch": 1.4967942956438374, + "grad_norm": 0.2474612295627594, + "learning_rate": 3.298861077451818e-06, + "loss": 0.0154, + "step": 24980 + }, + { + "epoch": 1.4973934927197554, + "grad_norm": 0.32819071412086487, + "learning_rate": 3.293820073365243e-06, + "loss": 0.0189, + "step": 24990 + }, + { + "epoch": 1.497992689795674, + "grad_norm": 0.32721251249313354, + "learning_rate": 3.288788112910046e-06, + "loss": 0.0144, + "step": 25000 + }, + { + "epoch": 1.498591886871592, + "grad_norm": 0.4054602086544037, + "learning_rate": 3.2837652019915127e-06, + "loss": 0.017, + "step": 25010 + }, + { + "epoch": 1.4991910839475104, + "grad_norm": 0.4691202938556671, + "learning_rate": 3.2787513465043054e-06, + "loss": 0.0188, + "step": 25020 + }, + { + "epoch": 1.4997902810234285, + "grad_norm": 0.9318768382072449, + "learning_rate": 3.2737465523324663e-06, + "loss": 0.0192, + "step": 25030 + }, + { + "epoch": 1.500389478099347, + "grad_norm": 0.25441330671310425, + "learning_rate": 3.2687508253493987e-06, + "loss": 0.0143, + "step": 25040 + }, + { + "epoch": 1.500988675175265, + "grad_norm": 0.3425164520740509, + "learning_rate": 3.263764171417869e-06, + "loss": 0.0152, + "step": 25050 + }, + { + "epoch": 1.5015878722511835, + "grad_norm": 0.3809274733066559, + "learning_rate": 3.25878659638999e-06, + "loss": 0.0154, + "step": 25060 + }, + { + "epoch": 1.5021870693271016, + "grad_norm": 0.2595506012439728, + "learning_rate": 3.2538181061072245e-06, + "loss": 0.0144, + "step": 25070 + }, + { + "epoch": 1.50278626640302, + "grad_norm": 0.29121503233909607, + "learning_rate": 3.248858706400373e-06, + "loss": 0.0131, + "step": 25080 + }, + { + "epoch": 1.503385463478938, + "grad_norm": 0.2435981184244156, + "learning_rate": 3.2439084030895683e-06, + "loss": 0.0169, + "step": 25090 + }, + { + "epoch": 1.5039846605548566, + "grad_norm": 0.2967667579650879, + "learning_rate": 3.2389672019842664e-06, + "loss": 0.0146, + "step": 25100 + }, + { + "epoch": 1.5045838576307746, + "grad_norm": 0.2658415138721466, + "learning_rate": 3.23403510888324e-06, + "loss": 0.0164, + "step": 25110 + }, + { + "epoch": 1.5051830547066931, + "grad_norm": 0.25294387340545654, + "learning_rate": 3.229112129574576e-06, + "loss": 0.0137, + "step": 25120 + }, + { + "epoch": 1.5057822517826112, + "grad_norm": 0.4117964208126068, + "learning_rate": 3.224198269835665e-06, + "loss": 0.0175, + "step": 25130 + }, + { + "epoch": 1.5063814488585296, + "grad_norm": 0.22604988515377045, + "learning_rate": 3.219293535433198e-06, + "loss": 0.0187, + "step": 25140 + }, + { + "epoch": 1.5069806459344477, + "grad_norm": 0.2773517668247223, + "learning_rate": 3.214397932123149e-06, + "loss": 0.0176, + "step": 25150 + }, + { + "epoch": 1.5075798430103662, + "grad_norm": 0.3213720917701721, + "learning_rate": 3.209511465650783e-06, + "loss": 0.0181, + "step": 25160 + }, + { + "epoch": 1.5081790400862842, + "grad_norm": 0.3932463526725769, + "learning_rate": 3.2046341417506434e-06, + "loss": 0.0169, + "step": 25170 + }, + { + "epoch": 1.5087782371622027, + "grad_norm": 0.27642500400543213, + "learning_rate": 3.1997659661465395e-06, + "loss": 0.0177, + "step": 25180 + }, + { + "epoch": 1.5093774342381208, + "grad_norm": 0.4212909936904907, + "learning_rate": 3.1949069445515467e-06, + "loss": 0.0165, + "step": 25190 + }, + { + "epoch": 1.5099766313140393, + "grad_norm": 0.31928038597106934, + "learning_rate": 3.190057082667995e-06, + "loss": 0.0159, + "step": 25200 + }, + { + "epoch": 1.5105758283899573, + "grad_norm": 0.31685909628868103, + "learning_rate": 3.1852163861874704e-06, + "loss": 0.0146, + "step": 25210 + }, + { + "epoch": 1.5111750254658758, + "grad_norm": 0.22591470181941986, + "learning_rate": 3.1803848607907982e-06, + "loss": 0.0142, + "step": 25220 + }, + { + "epoch": 1.5117742225417938, + "grad_norm": 0.22344504296779633, + "learning_rate": 3.1755625121480454e-06, + "loss": 0.0127, + "step": 25230 + }, + { + "epoch": 1.5123734196177123, + "grad_norm": 0.4538969099521637, + "learning_rate": 3.1707493459185036e-06, + "loss": 0.0174, + "step": 25240 + }, + { + "epoch": 1.5129726166936306, + "grad_norm": 0.35422542691230774, + "learning_rate": 3.165945367750692e-06, + "loss": 0.0169, + "step": 25250 + }, + { + "epoch": 1.5135718137695489, + "grad_norm": 0.41911551356315613, + "learning_rate": 3.161150583282347e-06, + "loss": 0.0157, + "step": 25260 + }, + { + "epoch": 1.5141710108454671, + "grad_norm": 0.4679270088672638, + "learning_rate": 3.1563649981404167e-06, + "loss": 0.0147, + "step": 25270 + }, + { + "epoch": 1.5147702079213854, + "grad_norm": 0.29286396503448486, + "learning_rate": 3.1515886179410516e-06, + "loss": 0.0168, + "step": 25280 + }, + { + "epoch": 1.5153694049973037, + "grad_norm": 0.2840272784233093, + "learning_rate": 3.1468214482895963e-06, + "loss": 0.0182, + "step": 25290 + }, + { + "epoch": 1.515968602073222, + "grad_norm": 0.3369516432285309, + "learning_rate": 3.1420634947805924e-06, + "loss": 0.0159, + "step": 25300 + }, + { + "epoch": 1.5165677991491402, + "grad_norm": 0.36810392141342163, + "learning_rate": 3.1373147629977633e-06, + "loss": 0.0207, + "step": 25310 + }, + { + "epoch": 1.5171669962250585, + "grad_norm": 0.30844470858573914, + "learning_rate": 3.1325752585140136e-06, + "loss": 0.0151, + "step": 25320 + }, + { + "epoch": 1.5177661933009767, + "grad_norm": 0.22359415888786316, + "learning_rate": 3.127844986891409e-06, + "loss": 0.018, + "step": 25330 + }, + { + "epoch": 1.518365390376895, + "grad_norm": 0.42099806666374207, + "learning_rate": 3.123123953681191e-06, + "loss": 0.0158, + "step": 25340 + }, + { + "epoch": 1.5189645874528133, + "grad_norm": 0.2903825342655182, + "learning_rate": 3.1184121644237542e-06, + "loss": 0.0157, + "step": 25350 + }, + { + "epoch": 1.5195637845287315, + "grad_norm": 0.33182457089424133, + "learning_rate": 3.1137096246486474e-06, + "loss": 0.0179, + "step": 25360 + }, + { + "epoch": 1.5201629816046498, + "grad_norm": 0.4607376158237457, + "learning_rate": 3.1090163398745622e-06, + "loss": 0.0189, + "step": 25370 + }, + { + "epoch": 1.520762178680568, + "grad_norm": 0.21630525588989258, + "learning_rate": 3.1043323156093264e-06, + "loss": 0.0156, + "step": 25380 + }, + { + "epoch": 1.5213613757564863, + "grad_norm": 0.38443559408187866, + "learning_rate": 3.099657557349906e-06, + "loss": 0.0174, + "step": 25390 + }, + { + "epoch": 1.5219605728324046, + "grad_norm": 0.19618573784828186, + "learning_rate": 3.09499207058239e-06, + "loss": 0.0126, + "step": 25400 + }, + { + "epoch": 1.5225597699083229, + "grad_norm": 0.4141467809677124, + "learning_rate": 3.090335860781989e-06, + "loss": 0.0147, + "step": 25410 + }, + { + "epoch": 1.5231589669842411, + "grad_norm": 0.39915844798088074, + "learning_rate": 3.085688933413021e-06, + "loss": 0.0156, + "step": 25420 + }, + { + "epoch": 1.5237581640601594, + "grad_norm": 0.25136515498161316, + "learning_rate": 3.081051293928915e-06, + "loss": 0.0147, + "step": 25430 + }, + { + "epoch": 1.5243573611360777, + "grad_norm": 0.30357712507247925, + "learning_rate": 3.0764229477722004e-06, + "loss": 0.0153, + "step": 25440 + }, + { + "epoch": 1.524956558211996, + "grad_norm": 0.37422874569892883, + "learning_rate": 3.071803900374501e-06, + "loss": 0.0146, + "step": 25450 + }, + { + "epoch": 1.5255557552879142, + "grad_norm": 0.19593080878257751, + "learning_rate": 3.067194157156521e-06, + "loss": 0.0185, + "step": 25460 + }, + { + "epoch": 1.5261549523638325, + "grad_norm": 0.4984768033027649, + "learning_rate": 3.062593723528057e-06, + "loss": 0.0159, + "step": 25470 + }, + { + "epoch": 1.5267541494397507, + "grad_norm": 0.35011765360832214, + "learning_rate": 3.0580026048879687e-06, + "loss": 0.0171, + "step": 25480 + }, + { + "epoch": 1.527353346515669, + "grad_norm": 0.43658894300460815, + "learning_rate": 3.0534208066241914e-06, + "loss": 0.014, + "step": 25490 + }, + { + "epoch": 1.5279525435915873, + "grad_norm": 0.3372974693775177, + "learning_rate": 3.048848334113722e-06, + "loss": 0.0205, + "step": 25500 + }, + { + "epoch": 1.5285517406675055, + "grad_norm": 0.2942260205745697, + "learning_rate": 3.0442851927226105e-06, + "loss": 0.0166, + "step": 25510 + }, + { + "epoch": 1.5291509377434238, + "grad_norm": 0.43129920959472656, + "learning_rate": 3.0397313878059564e-06, + "loss": 0.0167, + "step": 25520 + }, + { + "epoch": 1.529750134819342, + "grad_norm": 0.3023529648780823, + "learning_rate": 3.0351869247079046e-06, + "loss": 0.0167, + "step": 25530 + }, + { + "epoch": 1.5303493318952603, + "grad_norm": 0.298043429851532, + "learning_rate": 3.030651808761638e-06, + "loss": 0.0185, + "step": 25540 + }, + { + "epoch": 1.5309485289711786, + "grad_norm": 0.2765754461288452, + "learning_rate": 3.0261260452893643e-06, + "loss": 0.0134, + "step": 25550 + }, + { + "epoch": 1.5315477260470969, + "grad_norm": 0.43460533022880554, + "learning_rate": 3.021609639602321e-06, + "loss": 0.014, + "step": 25560 + }, + { + "epoch": 1.5321469231230151, + "grad_norm": 0.2843260169029236, + "learning_rate": 3.0171025970007597e-06, + "loss": 0.0155, + "step": 25570 + }, + { + "epoch": 1.5327461201989334, + "grad_norm": 0.3337956964969635, + "learning_rate": 3.0126049227739463e-06, + "loss": 0.0164, + "step": 25580 + }, + { + "epoch": 1.5333453172748517, + "grad_norm": 0.4841095805168152, + "learning_rate": 3.008116622200155e-06, + "loss": 0.0147, + "step": 25590 + }, + { + "epoch": 1.53394451435077, + "grad_norm": 0.31032758951187134, + "learning_rate": 3.003637700546652e-06, + "loss": 0.015, + "step": 25600 + }, + { + "epoch": 1.5345437114266882, + "grad_norm": 0.4080669581890106, + "learning_rate": 2.9991681630697043e-06, + "loss": 0.0151, + "step": 25610 + }, + { + "epoch": 1.5351429085026065, + "grad_norm": 0.23705625534057617, + "learning_rate": 2.994708015014563e-06, + "loss": 0.0136, + "step": 25620 + }, + { + "epoch": 1.5357421055785248, + "grad_norm": 0.5293036103248596, + "learning_rate": 2.9902572616154608e-06, + "loss": 0.0195, + "step": 25630 + }, + { + "epoch": 1.536341302654443, + "grad_norm": 0.19166356325149536, + "learning_rate": 2.985815908095603e-06, + "loss": 0.0118, + "step": 25640 + }, + { + "epoch": 1.5369404997303613, + "grad_norm": 0.35923510789871216, + "learning_rate": 2.981383959667165e-06, + "loss": 0.0153, + "step": 25650 + }, + { + "epoch": 1.5375396968062796, + "grad_norm": 0.525636613368988, + "learning_rate": 2.9769614215312885e-06, + "loss": 0.0169, + "step": 25660 + }, + { + "epoch": 1.5381388938821978, + "grad_norm": 0.3833159804344177, + "learning_rate": 2.9725482988780636e-06, + "loss": 0.0155, + "step": 25670 + }, + { + "epoch": 1.538738090958116, + "grad_norm": 0.30203381180763245, + "learning_rate": 2.9681445968865403e-06, + "loss": 0.0163, + "step": 25680 + }, + { + "epoch": 1.5393372880340344, + "grad_norm": 0.5735456347465515, + "learning_rate": 2.963750320724704e-06, + "loss": 0.0158, + "step": 25690 + }, + { + "epoch": 1.5399364851099526, + "grad_norm": 0.4676662087440491, + "learning_rate": 2.9593654755494845e-06, + "loss": 0.0195, + "step": 25700 + }, + { + "epoch": 1.540535682185871, + "grad_norm": 0.29208818078041077, + "learning_rate": 2.954990066506741e-06, + "loss": 0.0165, + "step": 25710 + }, + { + "epoch": 1.5411348792617892, + "grad_norm": 0.3703807294368744, + "learning_rate": 2.9506240987312623e-06, + "loss": 0.015, + "step": 25720 + }, + { + "epoch": 1.5417340763377074, + "grad_norm": 0.5645684003829956, + "learning_rate": 2.9462675773467525e-06, + "loss": 0.0192, + "step": 25730 + }, + { + "epoch": 1.5423332734136257, + "grad_norm": 0.5154808759689331, + "learning_rate": 2.9419205074658314e-06, + "loss": 0.0154, + "step": 25740 + }, + { + "epoch": 1.542932470489544, + "grad_norm": 0.49836722016334534, + "learning_rate": 2.93758289419003e-06, + "loss": 0.0161, + "step": 25750 + }, + { + "epoch": 1.5435316675654622, + "grad_norm": 0.4711974561214447, + "learning_rate": 2.9332547426097768e-06, + "loss": 0.0143, + "step": 25760 + }, + { + "epoch": 1.5441308646413805, + "grad_norm": 0.3468717932701111, + "learning_rate": 2.9289360578044016e-06, + "loss": 0.0151, + "step": 25770 + }, + { + "epoch": 1.5447300617172988, + "grad_norm": 0.3216229975223541, + "learning_rate": 2.924626844842118e-06, + "loss": 0.0146, + "step": 25780 + }, + { + "epoch": 1.5453292587932173, + "grad_norm": 0.3436613976955414, + "learning_rate": 2.9203271087800287e-06, + "loss": 0.0172, + "step": 25790 + }, + { + "epoch": 1.5459284558691353, + "grad_norm": 0.3601810336112976, + "learning_rate": 2.916036854664115e-06, + "loss": 0.0166, + "step": 25800 + }, + { + "epoch": 1.5465276529450538, + "grad_norm": 0.2320292890071869, + "learning_rate": 2.911756087529229e-06, + "loss": 0.0156, + "step": 25810 + }, + { + "epoch": 1.5471268500209718, + "grad_norm": 0.4563167989253998, + "learning_rate": 2.907484812399086e-06, + "loss": 0.0168, + "step": 25820 + }, + { + "epoch": 1.5477260470968903, + "grad_norm": 0.33735397458076477, + "learning_rate": 2.9032230342862687e-06, + "loss": 0.0165, + "step": 25830 + }, + { + "epoch": 1.5483252441728084, + "grad_norm": 0.41785505414009094, + "learning_rate": 2.898970758192212e-06, + "loss": 0.0179, + "step": 25840 + }, + { + "epoch": 1.5489244412487269, + "grad_norm": 0.41172194480895996, + "learning_rate": 2.8947279891071935e-06, + "loss": 0.016, + "step": 25850 + }, + { + "epoch": 1.549523638324645, + "grad_norm": 0.4549838900566101, + "learning_rate": 2.8904947320103453e-06, + "loss": 0.0134, + "step": 25860 + }, + { + "epoch": 1.5501228354005634, + "grad_norm": 0.6315169930458069, + "learning_rate": 2.886270991869626e-06, + "loss": 0.0176, + "step": 25870 + }, + { + "epoch": 1.5507220324764814, + "grad_norm": 0.43143466114997864, + "learning_rate": 2.8820567736418296e-06, + "loss": 0.0166, + "step": 25880 + }, + { + "epoch": 1.5513212295524, + "grad_norm": 0.4559693932533264, + "learning_rate": 2.877852082272579e-06, + "loss": 0.0162, + "step": 25890 + }, + { + "epoch": 1.551920426628318, + "grad_norm": 0.3333865702152252, + "learning_rate": 2.8736569226963148e-06, + "loss": 0.0168, + "step": 25900 + }, + { + "epoch": 1.5525196237042365, + "grad_norm": 0.3939986526966095, + "learning_rate": 2.8694712998362858e-06, + "loss": 0.0146, + "step": 25910 + }, + { + "epoch": 1.5531188207801545, + "grad_norm": 0.35824787616729736, + "learning_rate": 2.865295218604555e-06, + "loss": 0.0179, + "step": 25920 + }, + { + "epoch": 1.553718017856073, + "grad_norm": 0.40517401695251465, + "learning_rate": 2.8611286839019884e-06, + "loss": 0.0158, + "step": 25930 + }, + { + "epoch": 1.554317214931991, + "grad_norm": 0.41149890422821045, + "learning_rate": 2.8569717006182487e-06, + "loss": 0.0142, + "step": 25940 + }, + { + "epoch": 1.5549164120079095, + "grad_norm": 0.22149957716464996, + "learning_rate": 2.852824273631779e-06, + "loss": 0.0153, + "step": 25950 + }, + { + "epoch": 1.5555156090838276, + "grad_norm": 0.2622004747390747, + "learning_rate": 2.8486864078098214e-06, + "loss": 0.0142, + "step": 25960 + }, + { + "epoch": 1.556114806159746, + "grad_norm": 0.3235580623149872, + "learning_rate": 2.8445581080083923e-06, + "loss": 0.014, + "step": 25970 + }, + { + "epoch": 1.5567140032356641, + "grad_norm": 0.4349730312824249, + "learning_rate": 2.8404393790722796e-06, + "loss": 0.0148, + "step": 25980 + }, + { + "epoch": 1.5573132003115826, + "grad_norm": 0.30583831667900085, + "learning_rate": 2.8363302258350433e-06, + "loss": 0.0148, + "step": 25990 + }, + { + "epoch": 1.5579123973875006, + "grad_norm": 0.3436671495437622, + "learning_rate": 2.832230653119002e-06, + "loss": 0.015, + "step": 26000 + }, + { + "epoch": 1.5585115944634191, + "grad_norm": 0.23681265115737915, + "learning_rate": 2.828140665735232e-06, + "loss": 0.0169, + "step": 26010 + }, + { + "epoch": 1.5591107915393372, + "grad_norm": 0.2916300892829895, + "learning_rate": 2.8240602684835614e-06, + "loss": 0.0145, + "step": 26020 + }, + { + "epoch": 1.5597099886152557, + "grad_norm": 0.4516601264476776, + "learning_rate": 2.8199894661525695e-06, + "loss": 0.0168, + "step": 26030 + }, + { + "epoch": 1.5603091856911737, + "grad_norm": 0.25640442967414856, + "learning_rate": 2.8159282635195604e-06, + "loss": 0.0145, + "step": 26040 + }, + { + "epoch": 1.5609083827670922, + "grad_norm": 0.3058616816997528, + "learning_rate": 2.8118766653505857e-06, + "loss": 0.0134, + "step": 26050 + }, + { + "epoch": 1.5615075798430103, + "grad_norm": 0.37286022305488586, + "learning_rate": 2.8078346764004217e-06, + "loss": 0.0133, + "step": 26060 + }, + { + "epoch": 1.5621067769189287, + "grad_norm": 0.2570302486419678, + "learning_rate": 2.8038023014125693e-06, + "loss": 0.0136, + "step": 26070 + }, + { + "epoch": 1.5627059739948468, + "grad_norm": 0.5596319437026978, + "learning_rate": 2.799779545119241e-06, + "loss": 0.0184, + "step": 26080 + }, + { + "epoch": 1.5633051710707653, + "grad_norm": 0.36270666122436523, + "learning_rate": 2.7957664122413685e-06, + "loss": 0.0158, + "step": 26090 + }, + { + "epoch": 1.5639043681466833, + "grad_norm": 0.4473365247249603, + "learning_rate": 2.7917629074885855e-06, + "loss": 0.0146, + "step": 26100 + }, + { + "epoch": 1.5645035652226018, + "grad_norm": 0.256773978471756, + "learning_rate": 2.78776903555923e-06, + "loss": 0.0141, + "step": 26110 + }, + { + "epoch": 1.5651027622985199, + "grad_norm": 0.3173777759075165, + "learning_rate": 2.7837848011403307e-06, + "loss": 0.0155, + "step": 26120 + }, + { + "epoch": 1.5657019593744383, + "grad_norm": 0.39649754762649536, + "learning_rate": 2.7798102089076096e-06, + "loss": 0.0171, + "step": 26130 + }, + { + "epoch": 1.5663011564503564, + "grad_norm": 0.8298602104187012, + "learning_rate": 2.7758452635254706e-06, + "loss": 0.0181, + "step": 26140 + }, + { + "epoch": 1.5669003535262749, + "grad_norm": 0.41698411107063293, + "learning_rate": 2.771889969647e-06, + "loss": 0.0155, + "step": 26150 + }, + { + "epoch": 1.567499550602193, + "grad_norm": 0.3315671384334564, + "learning_rate": 2.7679443319139547e-06, + "loss": 0.0142, + "step": 26160 + }, + { + "epoch": 1.5680987476781114, + "grad_norm": 0.27380600571632385, + "learning_rate": 2.76400835495676e-06, + "loss": 0.0146, + "step": 26170 + }, + { + "epoch": 1.5686979447540295, + "grad_norm": 0.2785346210002899, + "learning_rate": 2.760082043394504e-06, + "loss": 0.0174, + "step": 26180 + }, + { + "epoch": 1.569297141829948, + "grad_norm": 0.46294671297073364, + "learning_rate": 2.756165401834933e-06, + "loss": 0.0177, + "step": 26190 + }, + { + "epoch": 1.569896338905866, + "grad_norm": 0.3026588559150696, + "learning_rate": 2.7522584348744443e-06, + "loss": 0.016, + "step": 26200 + }, + { + "epoch": 1.5704955359817845, + "grad_norm": 0.335443377494812, + "learning_rate": 2.748361147098079e-06, + "loss": 0.0138, + "step": 26210 + }, + { + "epoch": 1.5710947330577025, + "grad_norm": 0.26176130771636963, + "learning_rate": 2.7444735430795245e-06, + "loss": 0.0182, + "step": 26220 + }, + { + "epoch": 1.571693930133621, + "grad_norm": 0.41030630469322205, + "learning_rate": 2.740595627381096e-06, + "loss": 0.0157, + "step": 26230 + }, + { + "epoch": 1.572293127209539, + "grad_norm": 0.25381243228912354, + "learning_rate": 2.7367274045537477e-06, + "loss": 0.0126, + "step": 26240 + }, + { + "epoch": 1.5728923242854576, + "grad_norm": 0.3790159821510315, + "learning_rate": 2.732868879137055e-06, + "loss": 0.0138, + "step": 26250 + }, + { + "epoch": 1.5734915213613756, + "grad_norm": 0.3830420672893524, + "learning_rate": 2.7290200556592094e-06, + "loss": 0.0134, + "step": 26260 + }, + { + "epoch": 1.574090718437294, + "grad_norm": 0.534146785736084, + "learning_rate": 2.72518093863702e-06, + "loss": 0.0151, + "step": 26270 + }, + { + "epoch": 1.5746899155132121, + "grad_norm": 0.5088993310928345, + "learning_rate": 2.721351532575906e-06, + "loss": 0.0188, + "step": 26280 + }, + { + "epoch": 1.5752891125891306, + "grad_norm": 0.271245539188385, + "learning_rate": 2.717531841969889e-06, + "loss": 0.015, + "step": 26290 + }, + { + "epoch": 1.5758883096650487, + "grad_norm": 0.7041701078414917, + "learning_rate": 2.713721871301588e-06, + "loss": 0.0205, + "step": 26300 + }, + { + "epoch": 1.5764875067409672, + "grad_norm": 1.5670353174209595, + "learning_rate": 2.709921625042214e-06, + "loss": 0.0246, + "step": 26310 + }, + { + "epoch": 1.5770867038168854, + "grad_norm": 0.3782089054584503, + "learning_rate": 2.7061311076515717e-06, + "loss": 0.0145, + "step": 26320 + }, + { + "epoch": 1.5776859008928037, + "grad_norm": 0.2301669716835022, + "learning_rate": 2.7023503235780395e-06, + "loss": 0.0132, + "step": 26330 + }, + { + "epoch": 1.578285097968722, + "grad_norm": 0.4629409611225128, + "learning_rate": 2.6985792772585826e-06, + "loss": 0.013, + "step": 26340 + }, + { + "epoch": 1.5788842950446402, + "grad_norm": 0.2709483802318573, + "learning_rate": 2.6948179731187315e-06, + "loss": 0.0154, + "step": 26350 + }, + { + "epoch": 1.5794834921205585, + "grad_norm": 0.31532853841781616, + "learning_rate": 2.6910664155725847e-06, + "loss": 0.016, + "step": 26360 + }, + { + "epoch": 1.5800826891964768, + "grad_norm": 0.350920170545578, + "learning_rate": 2.6873246090228063e-06, + "loss": 0.016, + "step": 26370 + }, + { + "epoch": 1.580681886272395, + "grad_norm": 0.5954864025115967, + "learning_rate": 2.683592557860616e-06, + "loss": 0.0178, + "step": 26380 + }, + { + "epoch": 1.5812810833483133, + "grad_norm": 0.4362819492816925, + "learning_rate": 2.6798702664657803e-06, + "loss": 0.017, + "step": 26390 + }, + { + "epoch": 1.5818802804242316, + "grad_norm": 0.2640637755393982, + "learning_rate": 2.6761577392066163e-06, + "loss": 0.0146, + "step": 26400 + }, + { + "epoch": 1.5824794775001498, + "grad_norm": 0.475008100271225, + "learning_rate": 2.6724549804399845e-06, + "loss": 0.0134, + "step": 26410 + }, + { + "epoch": 1.583078674576068, + "grad_norm": 0.27583909034729004, + "learning_rate": 2.6687619945112743e-06, + "loss": 0.014, + "step": 26420 + }, + { + "epoch": 1.5836778716519864, + "grad_norm": 0.392715722322464, + "learning_rate": 2.6650787857544134e-06, + "loss": 0.0183, + "step": 26430 + }, + { + "epoch": 1.5842770687279046, + "grad_norm": 0.19658122956752777, + "learning_rate": 2.661405358491851e-06, + "loss": 0.0172, + "step": 26440 + }, + { + "epoch": 1.584876265803823, + "grad_norm": 0.8701423406600952, + "learning_rate": 2.6577417170345594e-06, + "loss": 0.015, + "step": 26450 + }, + { + "epoch": 1.5854754628797412, + "grad_norm": 0.9331104159355164, + "learning_rate": 2.6540878656820246e-06, + "loss": 0.0152, + "step": 26460 + }, + { + "epoch": 1.5860746599556594, + "grad_norm": 0.29767271876335144, + "learning_rate": 2.6504438087222474e-06, + "loss": 0.0143, + "step": 26470 + }, + { + "epoch": 1.5866738570315777, + "grad_norm": 0.3449382781982422, + "learning_rate": 2.6468095504317275e-06, + "loss": 0.0151, + "step": 26480 + }, + { + "epoch": 1.587273054107496, + "grad_norm": 0.26225733757019043, + "learning_rate": 2.643185095075473e-06, + "loss": 0.0143, + "step": 26490 + }, + { + "epoch": 1.5878722511834142, + "grad_norm": 0.3581456243991852, + "learning_rate": 2.6395704469069837e-06, + "loss": 0.0138, + "step": 26500 + }, + { + "epoch": 1.5884714482593325, + "grad_norm": 0.246829554438591, + "learning_rate": 2.635965610168249e-06, + "loss": 0.0178, + "step": 26510 + }, + { + "epoch": 1.5890706453352508, + "grad_norm": 0.317020446062088, + "learning_rate": 2.6323705890897464e-06, + "loss": 0.0157, + "step": 26520 + }, + { + "epoch": 1.589669842411169, + "grad_norm": 0.3022174537181854, + "learning_rate": 2.628785387890433e-06, + "loss": 0.0132, + "step": 26530 + }, + { + "epoch": 1.5902690394870873, + "grad_norm": 0.26253461837768555, + "learning_rate": 2.6252100107777422e-06, + "loss": 0.0135, + "step": 26540 + }, + { + "epoch": 1.5908682365630056, + "grad_norm": 0.2757222056388855, + "learning_rate": 2.6216444619475786e-06, + "loss": 0.0138, + "step": 26550 + }, + { + "epoch": 1.5914674336389238, + "grad_norm": 0.3857184052467346, + "learning_rate": 2.6180887455843135e-06, + "loss": 0.013, + "step": 26560 + }, + { + "epoch": 1.5920666307148421, + "grad_norm": 0.4407658576965332, + "learning_rate": 2.6145428658607753e-06, + "loss": 0.0185, + "step": 26570 + }, + { + "epoch": 1.5926658277907604, + "grad_norm": 0.3413793444633484, + "learning_rate": 2.6110068269382534e-06, + "loss": 0.0135, + "step": 26580 + }, + { + "epoch": 1.5932650248666786, + "grad_norm": 0.24001765251159668, + "learning_rate": 2.6074806329664854e-06, + "loss": 0.0146, + "step": 26590 + }, + { + "epoch": 1.593864221942597, + "grad_norm": 0.4623468518257141, + "learning_rate": 2.6039642880836585e-06, + "loss": 0.015, + "step": 26600 + }, + { + "epoch": 1.5944634190185152, + "grad_norm": 0.32984790205955505, + "learning_rate": 2.600457796416397e-06, + "loss": 0.0159, + "step": 26610 + }, + { + "epoch": 1.5950626160944334, + "grad_norm": 0.31533241271972656, + "learning_rate": 2.5969611620797636e-06, + "loss": 0.0157, + "step": 26620 + }, + { + "epoch": 1.5956618131703517, + "grad_norm": 0.3851890563964844, + "learning_rate": 2.593474389177255e-06, + "loss": 0.0168, + "step": 26630 + }, + { + "epoch": 1.59626101024627, + "grad_norm": 0.41252562403678894, + "learning_rate": 2.5899974818007924e-06, + "loss": 0.0165, + "step": 26640 + }, + { + "epoch": 1.5968602073221883, + "grad_norm": 0.473445326089859, + "learning_rate": 2.586530444030723e-06, + "loss": 0.0123, + "step": 26650 + }, + { + "epoch": 1.5974594043981065, + "grad_norm": 0.3054860532283783, + "learning_rate": 2.583073279935805e-06, + "loss": 0.014, + "step": 26660 + }, + { + "epoch": 1.5980586014740248, + "grad_norm": 0.28879237174987793, + "learning_rate": 2.5796259935732143e-06, + "loss": 0.0171, + "step": 26670 + }, + { + "epoch": 1.598657798549943, + "grad_norm": 0.32456526160240173, + "learning_rate": 2.5761885889885346e-06, + "loss": 0.0143, + "step": 26680 + }, + { + "epoch": 1.5992569956258613, + "grad_norm": 0.5708281993865967, + "learning_rate": 2.5727610702157518e-06, + "loss": 0.0163, + "step": 26690 + }, + { + "epoch": 1.5998561927017796, + "grad_norm": 0.6487006545066833, + "learning_rate": 2.5693434412772496e-06, + "loss": 0.0169, + "step": 26700 + }, + { + "epoch": 1.6004553897776979, + "grad_norm": 0.3364347517490387, + "learning_rate": 2.565935706183804e-06, + "loss": 0.018, + "step": 26710 + }, + { + "epoch": 1.6010545868536161, + "grad_norm": 0.41275516152381897, + "learning_rate": 2.5625378689345837e-06, + "loss": 0.0153, + "step": 26720 + }, + { + "epoch": 1.6016537839295344, + "grad_norm": 0.391722708940506, + "learning_rate": 2.5591499335171394e-06, + "loss": 0.0161, + "step": 26730 + }, + { + "epoch": 1.6022529810054527, + "grad_norm": 0.3787323534488678, + "learning_rate": 2.555771903907403e-06, + "loss": 0.0174, + "step": 26740 + }, + { + "epoch": 1.602852178081371, + "grad_norm": 0.3075166940689087, + "learning_rate": 2.5524037840696787e-06, + "loss": 0.0145, + "step": 26750 + }, + { + "epoch": 1.6034513751572892, + "grad_norm": 0.3613744080066681, + "learning_rate": 2.5490455779566446e-06, + "loss": 0.0154, + "step": 26760 + }, + { + "epoch": 1.6040505722332075, + "grad_norm": 0.34713929891586304, + "learning_rate": 2.545697289509341e-06, + "loss": 0.0114, + "step": 26770 + }, + { + "epoch": 1.6046497693091257, + "grad_norm": 0.4100549519062042, + "learning_rate": 2.5423589226571733e-06, + "loss": 0.013, + "step": 26780 + }, + { + "epoch": 1.605248966385044, + "grad_norm": 0.3897320330142975, + "learning_rate": 2.5390304813179e-06, + "loss": 0.016, + "step": 26790 + }, + { + "epoch": 1.6058481634609623, + "grad_norm": 0.3584144413471222, + "learning_rate": 2.5357119693976297e-06, + "loss": 0.015, + "step": 26800 + }, + { + "epoch": 1.6064473605368805, + "grad_norm": 0.31220853328704834, + "learning_rate": 2.532403390790823e-06, + "loss": 0.0131, + "step": 26810 + }, + { + "epoch": 1.6070465576127988, + "grad_norm": 0.3192695379257202, + "learning_rate": 2.529104749380281e-06, + "loss": 0.0133, + "step": 26820 + }, + { + "epoch": 1.607645754688717, + "grad_norm": 0.30283334851264954, + "learning_rate": 2.5258160490371446e-06, + "loss": 0.0122, + "step": 26830 + }, + { + "epoch": 1.6082449517646353, + "grad_norm": 0.282143771648407, + "learning_rate": 2.5225372936208854e-06, + "loss": 0.0138, + "step": 26840 + }, + { + "epoch": 1.6088441488405536, + "grad_norm": 0.43043816089630127, + "learning_rate": 2.5192684869793043e-06, + "loss": 0.0155, + "step": 26850 + }, + { + "epoch": 1.609443345916472, + "grad_norm": 0.2672103941440582, + "learning_rate": 2.51600963294853e-06, + "loss": 0.0153, + "step": 26860 + }, + { + "epoch": 1.6100425429923901, + "grad_norm": 0.39164942502975464, + "learning_rate": 2.5127607353530097e-06, + "loss": 0.0145, + "step": 26870 + }, + { + "epoch": 1.6106417400683086, + "grad_norm": 0.33121028542518616, + "learning_rate": 2.5095217980055052e-06, + "loss": 0.014, + "step": 26880 + }, + { + "epoch": 1.6112409371442267, + "grad_norm": 0.46786385774612427, + "learning_rate": 2.5062928247070873e-06, + "loss": 0.0144, + "step": 26890 + }, + { + "epoch": 1.6118401342201452, + "grad_norm": 0.4348220229148865, + "learning_rate": 2.503073819247138e-06, + "loss": 0.0136, + "step": 26900 + }, + { + "epoch": 1.6124393312960632, + "grad_norm": 0.7225855588912964, + "learning_rate": 2.4998647854033393e-06, + "loss": 0.0159, + "step": 26910 + }, + { + "epoch": 1.6130385283719817, + "grad_norm": 0.540884256362915, + "learning_rate": 2.4966657269416738e-06, + "loss": 0.0153, + "step": 26920 + }, + { + "epoch": 1.6136377254478997, + "grad_norm": 0.2984727919101715, + "learning_rate": 2.49347664761641e-06, + "loss": 0.0127, + "step": 26930 + }, + { + "epoch": 1.6142369225238182, + "grad_norm": 0.34762996435165405, + "learning_rate": 2.490297551170112e-06, + "loss": 0.0181, + "step": 26940 + }, + { + "epoch": 1.6148361195997363, + "grad_norm": 0.4229494035243988, + "learning_rate": 2.487128441333628e-06, + "loss": 0.0184, + "step": 26950 + }, + { + "epoch": 1.6154353166756548, + "grad_norm": 0.4511129558086395, + "learning_rate": 2.4839693218260844e-06, + "loss": 0.0136, + "step": 26960 + }, + { + "epoch": 1.6160345137515728, + "grad_norm": 0.20887398719787598, + "learning_rate": 2.4808201963548844e-06, + "loss": 0.0136, + "step": 26970 + }, + { + "epoch": 1.6166337108274913, + "grad_norm": 0.27858126163482666, + "learning_rate": 2.477681068615698e-06, + "loss": 0.0128, + "step": 26980 + }, + { + "epoch": 1.6172329079034093, + "grad_norm": 0.32049617171287537, + "learning_rate": 2.4745519422924715e-06, + "loss": 0.013, + "step": 26990 + }, + { + "epoch": 1.6178321049793278, + "grad_norm": 0.4276943802833557, + "learning_rate": 2.471432821057406e-06, + "loss": 0.0152, + "step": 27000 + }, + { + "epoch": 1.6184313020552459, + "grad_norm": 0.29610252380371094, + "learning_rate": 2.4683237085709673e-06, + "loss": 0.0122, + "step": 27010 + }, + { + "epoch": 1.6190304991311644, + "grad_norm": 0.24043124914169312, + "learning_rate": 2.4652246084818678e-06, + "loss": 0.0116, + "step": 27020 + }, + { + "epoch": 1.6196296962070824, + "grad_norm": 0.33894526958465576, + "learning_rate": 2.4621355244270764e-06, + "loss": 0.0119, + "step": 27030 + }, + { + "epoch": 1.620228893283001, + "grad_norm": 0.2597903609275818, + "learning_rate": 2.4590564600318047e-06, + "loss": 0.0144, + "step": 27040 + }, + { + "epoch": 1.620828090358919, + "grad_norm": 0.4067903459072113, + "learning_rate": 2.4559874189095077e-06, + "loss": 0.0137, + "step": 27050 + }, + { + "epoch": 1.6214272874348374, + "grad_norm": 0.48484402894973755, + "learning_rate": 2.4529284046618745e-06, + "loss": 0.0147, + "step": 27060 + }, + { + "epoch": 1.6220264845107555, + "grad_norm": 0.52725750207901, + "learning_rate": 2.4498794208788296e-06, + "loss": 0.0175, + "step": 27070 + }, + { + "epoch": 1.622625681586674, + "grad_norm": 0.23465880751609802, + "learning_rate": 2.446840471138524e-06, + "loss": 0.0123, + "step": 27080 + }, + { + "epoch": 1.623224878662592, + "grad_norm": 0.4273434579372406, + "learning_rate": 2.443811559007335e-06, + "loss": 0.015, + "step": 27090 + }, + { + "epoch": 1.6238240757385105, + "grad_norm": 0.2985517680644989, + "learning_rate": 2.440792688039862e-06, + "loss": 0.013, + "step": 27100 + }, + { + "epoch": 1.6244232728144286, + "grad_norm": 0.4334832727909088, + "learning_rate": 2.437783861778914e-06, + "loss": 0.0113, + "step": 27110 + }, + { + "epoch": 1.625022469890347, + "grad_norm": 0.2899027466773987, + "learning_rate": 2.4347850837555197e-06, + "loss": 0.0153, + "step": 27120 + }, + { + "epoch": 1.625621666966265, + "grad_norm": 0.35197123885154724, + "learning_rate": 2.4317963574889108e-06, + "loss": 0.0143, + "step": 27130 + }, + { + "epoch": 1.6262208640421836, + "grad_norm": 0.25402888655662537, + "learning_rate": 2.428817686486524e-06, + "loss": 0.0152, + "step": 27140 + }, + { + "epoch": 1.6268200611181016, + "grad_norm": 0.49205178022384644, + "learning_rate": 2.425849074243997e-06, + "loss": 0.014, + "step": 27150 + }, + { + "epoch": 1.6274192581940201, + "grad_norm": 0.2541142404079437, + "learning_rate": 2.4228905242451593e-06, + "loss": 0.0134, + "step": 27160 + }, + { + "epoch": 1.6280184552699382, + "grad_norm": 0.4348624646663666, + "learning_rate": 2.419942039962035e-06, + "loss": 0.0126, + "step": 27170 + }, + { + "epoch": 1.6286176523458566, + "grad_norm": 0.33341577649116516, + "learning_rate": 2.4170036248548345e-06, + "loss": 0.0149, + "step": 27180 + }, + { + "epoch": 1.6292168494217747, + "grad_norm": 0.394909143447876, + "learning_rate": 2.414075282371954e-06, + "loss": 0.0146, + "step": 27190 + }, + { + "epoch": 1.6298160464976932, + "grad_norm": 0.47289931774139404, + "learning_rate": 2.411157015949963e-06, + "loss": 0.0165, + "step": 27200 + }, + { + "epoch": 1.6304152435736112, + "grad_norm": 0.45220911502838135, + "learning_rate": 2.408248829013611e-06, + "loss": 0.0116, + "step": 27210 + }, + { + "epoch": 1.6310144406495297, + "grad_norm": 0.36566999554634094, + "learning_rate": 2.4053507249758174e-06, + "loss": 0.0158, + "step": 27220 + }, + { + "epoch": 1.6316136377254478, + "grad_norm": 0.26231661438941956, + "learning_rate": 2.40246270723767e-06, + "loss": 0.0131, + "step": 27230 + }, + { + "epoch": 1.6322128348013663, + "grad_norm": 0.32366135716438293, + "learning_rate": 2.399584779188417e-06, + "loss": 0.0131, + "step": 27240 + }, + { + "epoch": 1.6328120318772843, + "grad_norm": 0.3068046271800995, + "learning_rate": 2.396716944205467e-06, + "loss": 0.0123, + "step": 27250 + }, + { + "epoch": 1.6334112289532028, + "grad_norm": 0.28027409315109253, + "learning_rate": 2.3938592056543853e-06, + "loss": 0.013, + "step": 27260 + }, + { + "epoch": 1.6340104260291208, + "grad_norm": 0.3580668270587921, + "learning_rate": 2.391011566888887e-06, + "loss": 0.0133, + "step": 27270 + }, + { + "epoch": 1.6346096231050393, + "grad_norm": 0.42907601594924927, + "learning_rate": 2.3881740312508346e-06, + "loss": 0.0148, + "step": 27280 + }, + { + "epoch": 1.6352088201809574, + "grad_norm": 0.2437274307012558, + "learning_rate": 2.3853466020702323e-06, + "loss": 0.014, + "step": 27290 + }, + { + "epoch": 1.6358080172568759, + "grad_norm": 0.3689195513725281, + "learning_rate": 2.382529282665229e-06, + "loss": 0.0113, + "step": 27300 + }, + { + "epoch": 1.636407214332794, + "grad_norm": 0.48261409997940063, + "learning_rate": 2.379722076342103e-06, + "loss": 0.0141, + "step": 27310 + }, + { + "epoch": 1.6370064114087124, + "grad_norm": 0.3526110351085663, + "learning_rate": 2.376924986395271e-06, + "loss": 0.018, + "step": 27320 + }, + { + "epoch": 1.6376056084846304, + "grad_norm": 0.23795528709888458, + "learning_rate": 2.37413801610727e-06, + "loss": 0.0154, + "step": 27330 + }, + { + "epoch": 1.638204805560549, + "grad_norm": 0.40328165888786316, + "learning_rate": 2.371361168748767e-06, + "loss": 0.0128, + "step": 27340 + }, + { + "epoch": 1.638804002636467, + "grad_norm": 0.4420272409915924, + "learning_rate": 2.3685944475785463e-06, + "loss": 0.0137, + "step": 27350 + }, + { + "epoch": 1.6394031997123855, + "grad_norm": 0.23652666807174683, + "learning_rate": 2.3658378558435098e-06, + "loss": 0.014, + "step": 27360 + }, + { + "epoch": 1.6400023967883035, + "grad_norm": 0.3468151390552521, + "learning_rate": 2.363091396778672e-06, + "loss": 0.0155, + "step": 27370 + }, + { + "epoch": 1.640601593864222, + "grad_norm": 0.35930299758911133, + "learning_rate": 2.3603550736071535e-06, + "loss": 0.014, + "step": 27380 + }, + { + "epoch": 1.6412007909401403, + "grad_norm": 0.19394037127494812, + "learning_rate": 2.357628889540182e-06, + "loss": 0.0148, + "step": 27390 + }, + { + "epoch": 1.6417999880160585, + "grad_norm": 0.35877296328544617, + "learning_rate": 2.3549128477770894e-06, + "loss": 0.0136, + "step": 27400 + }, + { + "epoch": 1.6423991850919768, + "grad_norm": 0.29156941175460815, + "learning_rate": 2.3522069515052996e-06, + "loss": 0.0128, + "step": 27410 + }, + { + "epoch": 1.642998382167895, + "grad_norm": 0.3780912756919861, + "learning_rate": 2.349511203900333e-06, + "loss": 0.015, + "step": 27420 + }, + { + "epoch": 1.6435975792438133, + "grad_norm": 0.3290363848209381, + "learning_rate": 2.3468256081258e-06, + "loss": 0.0152, + "step": 27430 + }, + { + "epoch": 1.6441967763197316, + "grad_norm": 0.5973288416862488, + "learning_rate": 2.344150167333397e-06, + "loss": 0.015, + "step": 27440 + }, + { + "epoch": 1.6447959733956499, + "grad_norm": 0.4506072402000427, + "learning_rate": 2.3414848846629013e-06, + "loss": 0.0146, + "step": 27450 + }, + { + "epoch": 1.6453951704715681, + "grad_norm": 0.32139888405799866, + "learning_rate": 2.3388297632421727e-06, + "loss": 0.0168, + "step": 27460 + }, + { + "epoch": 1.6459943675474864, + "grad_norm": 0.3994857370853424, + "learning_rate": 2.3361848061871417e-06, + "loss": 0.0152, + "step": 27470 + }, + { + "epoch": 1.6465935646234047, + "grad_norm": 0.26820749044418335, + "learning_rate": 2.333550016601814e-06, + "loss": 0.0134, + "step": 27480 + }, + { + "epoch": 1.647192761699323, + "grad_norm": 0.3729577958583832, + "learning_rate": 2.3309253975782623e-06, + "loss": 0.0162, + "step": 27490 + }, + { + "epoch": 1.6477919587752412, + "grad_norm": 0.24220766127109528, + "learning_rate": 2.3283109521966236e-06, + "loss": 0.0138, + "step": 27500 + }, + { + "epoch": 1.6483911558511595, + "grad_norm": 0.49408698081970215, + "learning_rate": 2.325706683525094e-06, + "loss": 0.017, + "step": 27510 + }, + { + "epoch": 1.6489903529270777, + "grad_norm": 0.22594054043293, + "learning_rate": 2.3231125946199277e-06, + "loss": 0.0148, + "step": 27520 + }, + { + "epoch": 1.649589550002996, + "grad_norm": 0.41143184900283813, + "learning_rate": 2.320528688525433e-06, + "loss": 0.0152, + "step": 27530 + }, + { + "epoch": 1.6501887470789143, + "grad_norm": 0.3367273509502411, + "learning_rate": 2.317954968273969e-06, + "loss": 0.0138, + "step": 27540 + }, + { + "epoch": 1.6507879441548325, + "grad_norm": 0.6019514203071594, + "learning_rate": 2.3153914368859386e-06, + "loss": 0.0168, + "step": 27550 + }, + { + "epoch": 1.6513871412307508, + "grad_norm": 0.5941750407218933, + "learning_rate": 2.3128380973697868e-06, + "loss": 0.013, + "step": 27560 + }, + { + "epoch": 1.651986338306669, + "grad_norm": 0.43502920866012573, + "learning_rate": 2.3102949527220025e-06, + "loss": 0.0134, + "step": 27570 + }, + { + "epoch": 1.6525855353825873, + "grad_norm": 0.32287806272506714, + "learning_rate": 2.3077620059271054e-06, + "loss": 0.0131, + "step": 27580 + }, + { + "epoch": 1.6531847324585056, + "grad_norm": 0.4743358790874481, + "learning_rate": 2.305239259957653e-06, + "loss": 0.0198, + "step": 27590 + }, + { + "epoch": 1.6537839295344239, + "grad_norm": 0.29685747623443604, + "learning_rate": 2.302726717774224e-06, + "loss": 0.0147, + "step": 27600 + }, + { + "epoch": 1.6543831266103421, + "grad_norm": 0.4355921447277069, + "learning_rate": 2.3002243823254294e-06, + "loss": 0.0136, + "step": 27610 + }, + { + "epoch": 1.6549823236862604, + "grad_norm": 0.4096180498600006, + "learning_rate": 2.2977322565478988e-06, + "loss": 0.013, + "step": 27620 + }, + { + "epoch": 1.6555815207621787, + "grad_norm": 0.3704766631126404, + "learning_rate": 2.2952503433662806e-06, + "loss": 0.0152, + "step": 27630 + }, + { + "epoch": 1.656180717838097, + "grad_norm": 0.4177798628807068, + "learning_rate": 2.2927786456932383e-06, + "loss": 0.0147, + "step": 27640 + }, + { + "epoch": 1.6567799149140152, + "grad_norm": 0.32486793398857117, + "learning_rate": 2.2903171664294446e-06, + "loss": 0.0125, + "step": 27650 + }, + { + "epoch": 1.6573791119899335, + "grad_norm": 0.3335772752761841, + "learning_rate": 2.287865908463585e-06, + "loss": 0.0155, + "step": 27660 + }, + { + "epoch": 1.6579783090658518, + "grad_norm": 0.4169732332229614, + "learning_rate": 2.2854248746723464e-06, + "loss": 0.0153, + "step": 27670 + }, + { + "epoch": 1.65857750614177, + "grad_norm": 0.2390674203634262, + "learning_rate": 2.2829940679204192e-06, + "loss": 0.0161, + "step": 27680 + }, + { + "epoch": 1.6591767032176883, + "grad_norm": 0.41580212116241455, + "learning_rate": 2.280573491060488e-06, + "loss": 0.0116, + "step": 27690 + }, + { + "epoch": 1.6597759002936066, + "grad_norm": 0.3981385827064514, + "learning_rate": 2.278163146933236e-06, + "loss": 0.013, + "step": 27700 + }, + { + "epoch": 1.6603750973695248, + "grad_norm": 0.3737584948539734, + "learning_rate": 2.275763038367336e-06, + "loss": 0.011, + "step": 27710 + }, + { + "epoch": 1.660974294445443, + "grad_norm": 0.2370023876428604, + "learning_rate": 2.2733731681794505e-06, + "loss": 0.0173, + "step": 27720 + }, + { + "epoch": 1.6615734915213614, + "grad_norm": 0.6599531769752502, + "learning_rate": 2.270993539174225e-06, + "loss": 0.0145, + "step": 27730 + }, + { + "epoch": 1.6621726885972796, + "grad_norm": 0.3255928158760071, + "learning_rate": 2.268624154144283e-06, + "loss": 0.0149, + "step": 27740 + }, + { + "epoch": 1.662771885673198, + "grad_norm": 0.28063544631004333, + "learning_rate": 2.266265015870234e-06, + "loss": 0.0157, + "step": 27750 + }, + { + "epoch": 1.6633710827491162, + "grad_norm": 0.300642192363739, + "learning_rate": 2.2639161271206562e-06, + "loss": 0.0139, + "step": 27760 + }, + { + "epoch": 1.6639702798250344, + "grad_norm": 0.3485228717327118, + "learning_rate": 2.261577490652103e-06, + "loss": 0.0139, + "step": 27770 + }, + { + "epoch": 1.6645694769009527, + "grad_norm": 0.31508076190948486, + "learning_rate": 2.259249109209093e-06, + "loss": 0.0162, + "step": 27780 + }, + { + "epoch": 1.665168673976871, + "grad_norm": 0.4764767587184906, + "learning_rate": 2.256930985524111e-06, + "loss": 0.0145, + "step": 27790 + }, + { + "epoch": 1.6657678710527892, + "grad_norm": 0.26427552103996277, + "learning_rate": 2.2546231223176062e-06, + "loss": 0.013, + "step": 27800 + }, + { + "epoch": 1.6663670681287075, + "grad_norm": 0.5152391791343689, + "learning_rate": 2.2523255222979846e-06, + "loss": 0.015, + "step": 27810 + }, + { + "epoch": 1.6669662652046258, + "grad_norm": 0.4326762855052948, + "learning_rate": 2.2500381881616064e-06, + "loss": 0.0111, + "step": 27820 + }, + { + "epoch": 1.667565462280544, + "grad_norm": 0.3035188913345337, + "learning_rate": 2.2477611225927847e-06, + "loss": 0.0141, + "step": 27830 + }, + { + "epoch": 1.6681646593564623, + "grad_norm": 0.49474793672561646, + "learning_rate": 2.2454943282637852e-06, + "loss": 0.0129, + "step": 27840 + }, + { + "epoch": 1.6687638564323806, + "grad_norm": 0.46236565709114075, + "learning_rate": 2.2432378078348166e-06, + "loss": 0.016, + "step": 27850 + }, + { + "epoch": 1.6693630535082988, + "grad_norm": 0.31711387634277344, + "learning_rate": 2.2409915639540295e-06, + "loss": 0.013, + "step": 27860 + }, + { + "epoch": 1.669962250584217, + "grad_norm": 0.4073173701763153, + "learning_rate": 2.2387555992575192e-06, + "loss": 0.0124, + "step": 27870 + }, + { + "epoch": 1.6705614476601354, + "grad_norm": 0.3320833742618561, + "learning_rate": 2.236529916369313e-06, + "loss": 0.0172, + "step": 27880 + }, + { + "epoch": 1.6711606447360536, + "grad_norm": 0.4608694314956665, + "learning_rate": 2.2343145179013726e-06, + "loss": 0.0143, + "step": 27890 + }, + { + "epoch": 1.671759841811972, + "grad_norm": 0.9055055975914001, + "learning_rate": 2.232109406453595e-06, + "loss": 0.017, + "step": 27900 + }, + { + "epoch": 1.6723590388878904, + "grad_norm": 0.19240455329418182, + "learning_rate": 2.229914584613798e-06, + "loss": 0.0128, + "step": 27910 + }, + { + "epoch": 1.6729582359638084, + "grad_norm": 0.2756566107273102, + "learning_rate": 2.22773005495773e-06, + "loss": 0.0157, + "step": 27920 + }, + { + "epoch": 1.673557433039727, + "grad_norm": 0.47067585587501526, + "learning_rate": 2.2255558200490557e-06, + "loss": 0.0132, + "step": 27930 + }, + { + "epoch": 1.674156630115645, + "grad_norm": 0.421377032995224, + "learning_rate": 2.2233918824393625e-06, + "loss": 0.0137, + "step": 27940 + }, + { + "epoch": 1.6747558271915635, + "grad_norm": 0.437125563621521, + "learning_rate": 2.221238244668151e-06, + "loss": 0.0119, + "step": 27950 + }, + { + "epoch": 1.6753550242674815, + "grad_norm": 0.3617478311061859, + "learning_rate": 2.219094909262834e-06, + "loss": 0.0159, + "step": 27960 + }, + { + "epoch": 1.6759542213434, + "grad_norm": 0.39676180481910706, + "learning_rate": 2.2169618787387374e-06, + "loss": 0.0099, + "step": 27970 + }, + { + "epoch": 1.676553418419318, + "grad_norm": 0.24751955270767212, + "learning_rate": 2.2148391555990905e-06, + "loss": 0.0121, + "step": 27980 + }, + { + "epoch": 1.6771526154952365, + "grad_norm": 0.5263744592666626, + "learning_rate": 2.212726742335025e-06, + "loss": 0.0144, + "step": 27990 + }, + { + "epoch": 1.6777518125711546, + "grad_norm": 0.28027406334877014, + "learning_rate": 2.210624641425579e-06, + "loss": 0.0119, + "step": 28000 + }, + { + "epoch": 1.678351009647073, + "grad_norm": 0.37766972184181213, + "learning_rate": 2.208532855337684e-06, + "loss": 0.014, + "step": 28010 + }, + { + "epoch": 1.6789502067229911, + "grad_norm": 0.5175389051437378, + "learning_rate": 2.2064513865261646e-06, + "loss": 0.016, + "step": 28020 + }, + { + "epoch": 1.6795494037989096, + "grad_norm": 0.2620721459388733, + "learning_rate": 2.204380237433745e-06, + "loss": 0.0141, + "step": 28030 + }, + { + "epoch": 1.6801486008748276, + "grad_norm": 0.532120406627655, + "learning_rate": 2.202319410491029e-06, + "loss": 0.019, + "step": 28040 + }, + { + "epoch": 1.6807477979507461, + "grad_norm": 0.3872573971748352, + "learning_rate": 2.2002689081165155e-06, + "loss": 0.013, + "step": 28050 + }, + { + "epoch": 1.6813469950266642, + "grad_norm": 0.5482046008110046, + "learning_rate": 2.1982287327165827e-06, + "loss": 0.0121, + "step": 28060 + }, + { + "epoch": 1.6819461921025827, + "grad_norm": 0.2698966860771179, + "learning_rate": 2.19619888668549e-06, + "loss": 0.0154, + "step": 28070 + }, + { + "epoch": 1.6825453891785007, + "grad_norm": 0.5507254004478455, + "learning_rate": 2.1941793724053733e-06, + "loss": 0.0159, + "step": 28080 + }, + { + "epoch": 1.6831445862544192, + "grad_norm": 0.2223939299583435, + "learning_rate": 2.1921701922462463e-06, + "loss": 0.0165, + "step": 28090 + }, + { + "epoch": 1.6837437833303373, + "grad_norm": 0.2616906762123108, + "learning_rate": 2.190171348565994e-06, + "loss": 0.0162, + "step": 28100 + }, + { + "epoch": 1.6843429804062557, + "grad_norm": 0.23425602912902832, + "learning_rate": 2.188182843710369e-06, + "loss": 0.0142, + "step": 28110 + }, + { + "epoch": 1.6849421774821738, + "grad_norm": 0.6166255474090576, + "learning_rate": 2.1862046800129964e-06, + "loss": 0.0113, + "step": 28120 + }, + { + "epoch": 1.6855413745580923, + "grad_norm": 0.5097243189811707, + "learning_rate": 2.1842368597953578e-06, + "loss": 0.0117, + "step": 28130 + }, + { + "epoch": 1.6861405716340103, + "grad_norm": 0.371737003326416, + "learning_rate": 2.1822793853668e-06, + "loss": 0.0145, + "step": 28140 + }, + { + "epoch": 1.6867397687099288, + "grad_norm": 0.4312809407711029, + "learning_rate": 2.18033225902453e-06, + "loss": 0.0187, + "step": 28150 + }, + { + "epoch": 1.6873389657858469, + "grad_norm": 0.22457192838191986, + "learning_rate": 2.17839548305361e-06, + "loss": 0.0166, + "step": 28160 + }, + { + "epoch": 1.6879381628617653, + "grad_norm": 0.380092978477478, + "learning_rate": 2.1764690597269507e-06, + "loss": 0.0147, + "step": 28170 + }, + { + "epoch": 1.6885373599376834, + "grad_norm": 0.3026501536369324, + "learning_rate": 2.17455299130532e-06, + "loss": 0.0126, + "step": 28180 + }, + { + "epoch": 1.6891365570136019, + "grad_norm": 0.2680145800113678, + "learning_rate": 2.17264728003733e-06, + "loss": 0.0186, + "step": 28190 + }, + { + "epoch": 1.68973575408952, + "grad_norm": 0.36782440543174744, + "learning_rate": 2.17075192815944e-06, + "loss": 0.015, + "step": 28200 + }, + { + "epoch": 1.6903349511654384, + "grad_norm": 0.44267189502716064, + "learning_rate": 2.168866937895951e-06, + "loss": 0.0138, + "step": 28210 + }, + { + "epoch": 1.6909341482413565, + "grad_norm": 0.2974400818347931, + "learning_rate": 2.166992311459001e-06, + "loss": 0.0138, + "step": 28220 + }, + { + "epoch": 1.691533345317275, + "grad_norm": 0.30415791273117065, + "learning_rate": 2.1651280510485727e-06, + "loss": 0.0153, + "step": 28230 + }, + { + "epoch": 1.692132542393193, + "grad_norm": 0.3176470398902893, + "learning_rate": 2.163274158852476e-06, + "loss": 0.0117, + "step": 28240 + }, + { + "epoch": 1.6927317394691115, + "grad_norm": 0.4339515268802643, + "learning_rate": 2.1614306370463605e-06, + "loss": 0.0142, + "step": 28250 + }, + { + "epoch": 1.6933309365450295, + "grad_norm": 0.19751861691474915, + "learning_rate": 2.1595974877936977e-06, + "loss": 0.0141, + "step": 28260 + }, + { + "epoch": 1.693930133620948, + "grad_norm": 0.4287707507610321, + "learning_rate": 2.1577747132457933e-06, + "loss": 0.0125, + "step": 28270 + }, + { + "epoch": 1.694529330696866, + "grad_norm": 0.3044722378253937, + "learning_rate": 2.155962315541773e-06, + "loss": 0.0135, + "step": 28280 + }, + { + "epoch": 1.6951285277727846, + "grad_norm": 0.608513355255127, + "learning_rate": 2.154160296808588e-06, + "loss": 0.0142, + "step": 28290 + }, + { + "epoch": 1.6957277248487026, + "grad_norm": 0.48647579550743103, + "learning_rate": 2.1523686591610064e-06, + "loss": 0.0104, + "step": 28300 + }, + { + "epoch": 1.696326921924621, + "grad_norm": 0.2991415560245514, + "learning_rate": 2.1505874047016146e-06, + "loss": 0.0154, + "step": 28310 + }, + { + "epoch": 1.6969261190005391, + "grad_norm": 0.30744972825050354, + "learning_rate": 2.1488165355208147e-06, + "loss": 0.0132, + "step": 28320 + }, + { + "epoch": 1.6975253160764576, + "grad_norm": 0.31283605098724365, + "learning_rate": 2.14705605369682e-06, + "loss": 0.0134, + "step": 28330 + }, + { + "epoch": 1.6981245131523757, + "grad_norm": 0.31089895963668823, + "learning_rate": 2.145305961295655e-06, + "loss": 0.0114, + "step": 28340 + }, + { + "epoch": 1.6987237102282942, + "grad_norm": 0.33381298184394836, + "learning_rate": 2.143566260371149e-06, + "loss": 0.0119, + "step": 28350 + }, + { + "epoch": 1.6993229073042122, + "grad_norm": 0.3127349317073822, + "learning_rate": 2.141836952964938e-06, + "loss": 0.0123, + "step": 28360 + }, + { + "epoch": 1.6999221043801307, + "grad_norm": 0.2944924235343933, + "learning_rate": 2.1401180411064616e-06, + "loss": 0.0126, + "step": 28370 + }, + { + "epoch": 1.7005213014560487, + "grad_norm": 0.2658735513687134, + "learning_rate": 2.138409526812959e-06, + "loss": 0.0183, + "step": 28380 + }, + { + "epoch": 1.7011204985319672, + "grad_norm": 0.30414438247680664, + "learning_rate": 2.1367114120894663e-06, + "loss": 0.0155, + "step": 28390 + }, + { + "epoch": 1.7017196956078853, + "grad_norm": 0.2788392901420593, + "learning_rate": 2.1350236989288136e-06, + "loss": 0.0135, + "step": 28400 + }, + { + "epoch": 1.7023188926838038, + "grad_norm": 0.42318466305732727, + "learning_rate": 2.1333463893116294e-06, + "loss": 0.0145, + "step": 28410 + }, + { + "epoch": 1.7029180897597218, + "grad_norm": 0.3691503703594208, + "learning_rate": 2.131679485206329e-06, + "loss": 0.0153, + "step": 28420 + }, + { + "epoch": 1.7035172868356403, + "grad_norm": 0.39968568086624146, + "learning_rate": 2.130022988569117e-06, + "loss": 0.0112, + "step": 28430 + }, + { + "epoch": 1.7041164839115586, + "grad_norm": 0.5108732581138611, + "learning_rate": 2.128376901343984e-06, + "loss": 0.0121, + "step": 28440 + }, + { + "epoch": 1.7047156809874768, + "grad_norm": 0.1716325432062149, + "learning_rate": 2.1267412254627056e-06, + "loss": 0.0151, + "step": 28450 + }, + { + "epoch": 1.705314878063395, + "grad_norm": 0.21164365112781525, + "learning_rate": 2.1251159628448386e-06, + "loss": 0.0134, + "step": 28460 + }, + { + "epoch": 1.7059140751393134, + "grad_norm": 0.329767107963562, + "learning_rate": 2.1235011153977192e-06, + "loss": 0.0138, + "step": 28470 + }, + { + "epoch": 1.7065132722152316, + "grad_norm": 0.29405954480171204, + "learning_rate": 2.121896685016461e-06, + "loss": 0.0117, + "step": 28480 + }, + { + "epoch": 1.70711246929115, + "grad_norm": 0.3556554615497589, + "learning_rate": 2.1203026735839514e-06, + "loss": 0.0112, + "step": 28490 + }, + { + "epoch": 1.7077116663670682, + "grad_norm": 0.20903514325618744, + "learning_rate": 2.118719082970852e-06, + "loss": 0.015, + "step": 28500 + }, + { + "epoch": 1.7083108634429864, + "grad_norm": 0.3857610821723938, + "learning_rate": 2.1171459150355947e-06, + "loss": 0.0142, + "step": 28510 + }, + { + "epoch": 1.7089100605189047, + "grad_norm": 0.37805458903312683, + "learning_rate": 2.115583171624381e-06, + "loss": 0.0128, + "step": 28520 + }, + { + "epoch": 1.709509257594823, + "grad_norm": 0.31887349486351013, + "learning_rate": 2.114030854571176e-06, + "loss": 0.0147, + "step": 28530 + }, + { + "epoch": 1.7101084546707412, + "grad_norm": 0.21606838703155518, + "learning_rate": 2.1124889656977097e-06, + "loss": 0.0122, + "step": 28540 + }, + { + "epoch": 1.7107076517466595, + "grad_norm": 0.36150410771369934, + "learning_rate": 2.1109575068134756e-06, + "loss": 0.0155, + "step": 28550 + }, + { + "epoch": 1.7113068488225778, + "grad_norm": 0.41081342101097107, + "learning_rate": 2.1094364797157267e-06, + "loss": 0.0157, + "step": 28560 + }, + { + "epoch": 1.711906045898496, + "grad_norm": 0.30500170588493347, + "learning_rate": 2.107925886189472e-06, + "loss": 0.013, + "step": 28570 + }, + { + "epoch": 1.7125052429744143, + "grad_norm": 0.45380985736846924, + "learning_rate": 2.1064257280074763e-06, + "loss": 0.0141, + "step": 28580 + }, + { + "epoch": 1.7131044400503326, + "grad_norm": 0.3077009618282318, + "learning_rate": 2.1049360069302594e-06, + "loss": 0.0172, + "step": 28590 + }, + { + "epoch": 1.7137036371262508, + "grad_norm": 0.3113479018211365, + "learning_rate": 2.1034567247060926e-06, + "loss": 0.0151, + "step": 28600 + }, + { + "epoch": 1.7143028342021691, + "grad_norm": 0.4720151126384735, + "learning_rate": 2.1019878830709968e-06, + "loss": 0.0146, + "step": 28610 + }, + { + "epoch": 1.7149020312780874, + "grad_norm": 0.40217068791389465, + "learning_rate": 2.100529483748737e-06, + "loss": 0.0144, + "step": 28620 + }, + { + "epoch": 1.7155012283540056, + "grad_norm": 0.4546513557434082, + "learning_rate": 2.099081528450828e-06, + "loss": 0.0141, + "step": 28630 + }, + { + "epoch": 1.716100425429924, + "grad_norm": 0.4527282416820526, + "learning_rate": 2.097644018876524e-06, + "loss": 0.0112, + "step": 28640 + }, + { + "epoch": 1.7166996225058422, + "grad_norm": 0.34587305784225464, + "learning_rate": 2.096216956712826e-06, + "loss": 0.0108, + "step": 28650 + }, + { + "epoch": 1.7172988195817604, + "grad_norm": 0.37963685393333435, + "learning_rate": 2.0948003436344666e-06, + "loss": 0.0125, + "step": 28660 + }, + { + "epoch": 1.7178980166576787, + "grad_norm": 0.3598407208919525, + "learning_rate": 2.0933941813039244e-06, + "loss": 0.014, + "step": 28670 + }, + { + "epoch": 1.718497213733597, + "grad_norm": 0.40873903036117554, + "learning_rate": 2.091998471371406e-06, + "loss": 0.0123, + "step": 28680 + }, + { + "epoch": 1.7190964108095153, + "grad_norm": 0.27075979113578796, + "learning_rate": 2.0906132154748557e-06, + "loss": 0.0118, + "step": 28690 + }, + { + "epoch": 1.7196956078854335, + "grad_norm": 0.3349001109600067, + "learning_rate": 2.0892384152399504e-06, + "loss": 0.0163, + "step": 28700 + }, + { + "epoch": 1.7202948049613518, + "grad_norm": 0.2682032287120819, + "learning_rate": 2.0878740722800917e-06, + "loss": 0.0121, + "step": 28710 + }, + { + "epoch": 1.72089400203727, + "grad_norm": 0.45613598823547363, + "learning_rate": 2.086520188196413e-06, + "loss": 0.0139, + "step": 28720 + }, + { + "epoch": 1.7214931991131883, + "grad_norm": 0.4061899781227112, + "learning_rate": 2.085176764577774e-06, + "loss": 0.0167, + "step": 28730 + }, + { + "epoch": 1.7220923961891066, + "grad_norm": 0.24202635884284973, + "learning_rate": 2.083843803000755e-06, + "loss": 0.0158, + "step": 28740 + }, + { + "epoch": 1.7226915932650249, + "grad_norm": 0.44541120529174805, + "learning_rate": 2.0825213050296636e-06, + "loss": 0.0163, + "step": 28750 + }, + { + "epoch": 1.7232907903409431, + "grad_norm": 0.35003194212913513, + "learning_rate": 2.081209272216522e-06, + "loss": 0.0147, + "step": 28760 + }, + { + "epoch": 1.7238899874168614, + "grad_norm": 0.3613188564777374, + "learning_rate": 2.079907706101075e-06, + "loss": 0.0158, + "step": 28770 + }, + { + "epoch": 1.7244891844927797, + "grad_norm": 0.2081748992204666, + "learning_rate": 2.0786166082107833e-06, + "loss": 0.0119, + "step": 28780 + }, + { + "epoch": 1.725088381568698, + "grad_norm": 0.36700639128685, + "learning_rate": 2.0773359800608217e-06, + "loss": 0.0159, + "step": 28790 + }, + { + "epoch": 1.7256875786446162, + "grad_norm": 0.3384808599948883, + "learning_rate": 2.076065823154079e-06, + "loss": 0.0116, + "step": 28800 + }, + { + "epoch": 1.7262867757205345, + "grad_norm": 0.5698443055152893, + "learning_rate": 2.0748061389811543e-06, + "loss": 0.0135, + "step": 28810 + }, + { + "epoch": 1.7268859727964527, + "grad_norm": 0.22777511179447174, + "learning_rate": 2.073556929020357e-06, + "loss": 0.0136, + "step": 28820 + }, + { + "epoch": 1.727485169872371, + "grad_norm": 0.42319542169570923, + "learning_rate": 2.0723181947377057e-06, + "loss": 0.0138, + "step": 28830 + }, + { + "epoch": 1.7280843669482893, + "grad_norm": 0.48199185729026794, + "learning_rate": 2.0710899375869237e-06, + "loss": 0.0131, + "step": 28840 + }, + { + "epoch": 1.7286835640242075, + "grad_norm": 0.35982295870780945, + "learning_rate": 2.0698721590094387e-06, + "loss": 0.011, + "step": 28850 + }, + { + "epoch": 1.7292827611001258, + "grad_norm": 0.3580028712749481, + "learning_rate": 2.0686648604343824e-06, + "loss": 0.0177, + "step": 28860 + }, + { + "epoch": 1.729881958176044, + "grad_norm": 0.21845780313014984, + "learning_rate": 2.067468043278587e-06, + "loss": 0.0117, + "step": 28870 + }, + { + "epoch": 1.7304811552519623, + "grad_norm": 0.3009333908557892, + "learning_rate": 2.066281708946583e-06, + "loss": 0.0133, + "step": 28880 + }, + { + "epoch": 1.7310803523278806, + "grad_norm": 0.28064268827438354, + "learning_rate": 2.0651058588306007e-06, + "loss": 0.0118, + "step": 28890 + }, + { + "epoch": 1.7316795494037989, + "grad_norm": 0.2811881899833679, + "learning_rate": 2.063940494310565e-06, + "loss": 0.0127, + "step": 28900 + }, + { + "epoch": 1.7322787464797171, + "grad_norm": 0.25449663400650024, + "learning_rate": 2.062785616754097e-06, + "loss": 0.0152, + "step": 28910 + }, + { + "epoch": 1.7328779435556354, + "grad_norm": 0.41728776693344116, + "learning_rate": 2.0616412275165097e-06, + "loss": 0.0141, + "step": 28920 + }, + { + "epoch": 1.7334771406315537, + "grad_norm": 0.4925801753997803, + "learning_rate": 2.0605073279408063e-06, + "loss": 0.0114, + "step": 28930 + }, + { + "epoch": 1.734076337707472, + "grad_norm": 0.3441443145275116, + "learning_rate": 2.0593839193576833e-06, + "loss": 0.0175, + "step": 28940 + }, + { + "epoch": 1.7346755347833902, + "grad_norm": 0.598228931427002, + "learning_rate": 2.058271003085521e-06, + "loss": 0.0153, + "step": 28950 + }, + { + "epoch": 1.7352747318593085, + "grad_norm": 0.34356069564819336, + "learning_rate": 2.0571685804303905e-06, + "loss": 0.0126, + "step": 28960 + }, + { + "epoch": 1.7358739289352267, + "grad_norm": 0.2617851495742798, + "learning_rate": 2.0560766526860447e-06, + "loss": 0.0143, + "step": 28970 + }, + { + "epoch": 1.7364731260111452, + "grad_norm": 0.35475805401802063, + "learning_rate": 2.054995221133923e-06, + "loss": 0.0157, + "step": 28980 + }, + { + "epoch": 1.7370723230870633, + "grad_norm": 0.45460638403892517, + "learning_rate": 2.053924287043144e-06, + "loss": 0.0102, + "step": 28990 + }, + { + "epoch": 1.7376715201629818, + "grad_norm": 0.35972440242767334, + "learning_rate": 2.0528638516705106e-06, + "loss": 0.0137, + "step": 29000 + }, + { + "epoch": 1.7382707172388998, + "grad_norm": 0.3128221035003662, + "learning_rate": 2.051813916260501e-06, + "loss": 0.013, + "step": 29010 + }, + { + "epoch": 1.7388699143148183, + "grad_norm": 0.7588064670562744, + "learning_rate": 2.050774482045273e-06, + "loss": 0.0133, + "step": 29020 + }, + { + "epoch": 1.7394691113907363, + "grad_norm": 0.5074214935302734, + "learning_rate": 2.049745550244661e-06, + "loss": 0.016, + "step": 29030 + }, + { + "epoch": 1.7400683084666548, + "grad_norm": 0.48871514201164246, + "learning_rate": 2.0487271220661735e-06, + "loss": 0.0109, + "step": 29040 + }, + { + "epoch": 1.7406675055425729, + "grad_norm": 0.30255070328712463, + "learning_rate": 2.047719198704994e-06, + "loss": 0.0135, + "step": 29050 + }, + { + "epoch": 1.7412667026184914, + "grad_norm": 0.4563025236129761, + "learning_rate": 2.0467217813439762e-06, + "loss": 0.015, + "step": 29060 + }, + { + "epoch": 1.7418658996944094, + "grad_norm": 0.24640238285064697, + "learning_rate": 2.0457348711536426e-06, + "loss": 0.0137, + "step": 29070 + }, + { + "epoch": 1.742465096770328, + "grad_norm": 0.3724379241466522, + "learning_rate": 2.0447584692921894e-06, + "loss": 0.0141, + "step": 29080 + }, + { + "epoch": 1.743064293846246, + "grad_norm": 0.32838496565818787, + "learning_rate": 2.043792576905478e-06, + "loss": 0.0132, + "step": 29090 + }, + { + "epoch": 1.7436634909221644, + "grad_norm": 0.5715250968933105, + "learning_rate": 2.0428371951270394e-06, + "loss": 0.0125, + "step": 29100 + }, + { + "epoch": 1.7442626879980825, + "grad_norm": 0.29502353072166443, + "learning_rate": 2.0418923250780633e-06, + "loss": 0.0122, + "step": 29110 + }, + { + "epoch": 1.744861885074001, + "grad_norm": 0.2790152132511139, + "learning_rate": 2.0409579678674084e-06, + "loss": 0.0098, + "step": 29120 + }, + { + "epoch": 1.745461082149919, + "grad_norm": 0.9304683208465576, + "learning_rate": 2.040034124591597e-06, + "loss": 0.0146, + "step": 29130 + }, + { + "epoch": 1.7460602792258375, + "grad_norm": 0.26618465781211853, + "learning_rate": 2.039120796334809e-06, + "loss": 0.0128, + "step": 29140 + }, + { + "epoch": 1.7466594763017556, + "grad_norm": 0.28312423825263977, + "learning_rate": 2.0382179841688868e-06, + "loss": 0.0114, + "step": 29150 + }, + { + "epoch": 1.747258673377674, + "grad_norm": 0.30827805399894714, + "learning_rate": 2.0373256891533293e-06, + "loss": 0.0144, + "step": 29160 + }, + { + "epoch": 1.747857870453592, + "grad_norm": 0.29084426164627075, + "learning_rate": 2.0364439123352956e-06, + "loss": 0.0142, + "step": 29170 + }, + { + "epoch": 1.7484570675295106, + "grad_norm": 0.2825562655925751, + "learning_rate": 2.0355726547495998e-06, + "loss": 0.0129, + "step": 29180 + }, + { + "epoch": 1.7490562646054286, + "grad_norm": 0.5477129220962524, + "learning_rate": 2.034711917418711e-06, + "loss": 0.0149, + "step": 29190 + }, + { + "epoch": 1.7496554616813471, + "grad_norm": 0.27458444237709045, + "learning_rate": 2.033861701352752e-06, + "loss": 0.0131, + "step": 29200 + }, + { + "epoch": 1.7502546587572652, + "grad_norm": 0.5763506293296814, + "learning_rate": 2.0330220075494992e-06, + "loss": 0.012, + "step": 29210 + }, + { + "epoch": 1.7508538558331836, + "grad_norm": 0.29996973276138306, + "learning_rate": 2.0321928369943807e-06, + "loss": 0.0139, + "step": 29220 + }, + { + "epoch": 1.7514530529091017, + "grad_norm": 0.2447529435157776, + "learning_rate": 2.031374190660474e-06, + "loss": 0.0128, + "step": 29230 + }, + { + "epoch": 1.7520522499850202, + "grad_norm": 0.18921193480491638, + "learning_rate": 2.0305660695085054e-06, + "loss": 0.0132, + "step": 29240 + }, + { + "epoch": 1.7526514470609382, + "grad_norm": 0.35065901279449463, + "learning_rate": 2.0297684744868494e-06, + "loss": 0.0178, + "step": 29250 + }, + { + "epoch": 1.7532506441368567, + "grad_norm": 0.22698186337947845, + "learning_rate": 2.0289814065315306e-06, + "loss": 0.0131, + "step": 29260 + }, + { + "epoch": 1.7538498412127748, + "grad_norm": 0.7310769557952881, + "learning_rate": 2.0282048665662153e-06, + "loss": 0.0146, + "step": 29270 + }, + { + "epoch": 1.7544490382886933, + "grad_norm": 0.5522712469100952, + "learning_rate": 2.0274388555022176e-06, + "loss": 0.0135, + "step": 29280 + }, + { + "epoch": 1.7550482353646113, + "grad_norm": 0.29603326320648193, + "learning_rate": 2.0266833742384928e-06, + "loss": 0.016, + "step": 29290 + }, + { + "epoch": 1.7556474324405298, + "grad_norm": 0.3674398362636566, + "learning_rate": 2.0259384236616404e-06, + "loss": 0.0135, + "step": 29300 + }, + { + "epoch": 1.7562466295164478, + "grad_norm": 0.4478980302810669, + "learning_rate": 2.0252040046459022e-06, + "loss": 0.0178, + "step": 29310 + }, + { + "epoch": 1.7568458265923663, + "grad_norm": 0.32618647813796997, + "learning_rate": 2.02448011805316e-06, + "loss": 0.014, + "step": 29320 + }, + { + "epoch": 1.7574450236682844, + "grad_norm": 0.5377118587493896, + "learning_rate": 2.023766764732934e-06, + "loss": 0.0159, + "step": 29330 + }, + { + "epoch": 1.7580442207442029, + "grad_norm": 0.3777340352535248, + "learning_rate": 2.0230639455223853e-06, + "loss": 0.0143, + "step": 29340 + }, + { + "epoch": 1.758643417820121, + "grad_norm": 0.33518269658088684, + "learning_rate": 2.0223716612463095e-06, + "loss": 0.0149, + "step": 29350 + }, + { + "epoch": 1.7592426148960394, + "grad_norm": 0.3693374991416931, + "learning_rate": 2.0216899127171424e-06, + "loss": 0.0128, + "step": 29360 + }, + { + "epoch": 1.7598418119719574, + "grad_norm": 0.42809057235717773, + "learning_rate": 2.0210187007349534e-06, + "loss": 0.0168, + "step": 29370 + }, + { + "epoch": 1.760441009047876, + "grad_norm": 0.4278734028339386, + "learning_rate": 2.0203580260874474e-06, + "loss": 0.0125, + "step": 29380 + }, + { + "epoch": 1.761040206123794, + "grad_norm": 0.45604345202445984, + "learning_rate": 2.019707889549963e-06, + "loss": 0.0147, + "step": 29390 + }, + { + "epoch": 1.7616394031997125, + "grad_norm": 0.3464241921901703, + "learning_rate": 2.01906829188547e-06, + "loss": 0.015, + "step": 29400 + }, + { + "epoch": 1.7622386002756305, + "grad_norm": 0.28437861800193787, + "learning_rate": 2.018439233844574e-06, + "loss": 0.0138, + "step": 29410 + }, + { + "epoch": 1.762837797351549, + "grad_norm": 0.8128647208213806, + "learning_rate": 2.0178207161655087e-06, + "loss": 0.0146, + "step": 29420 + }, + { + "epoch": 1.763436994427467, + "grad_norm": 0.4243966341018677, + "learning_rate": 2.0172127395741398e-06, + "loss": 0.0138, + "step": 29430 + }, + { + "epoch": 1.7640361915033855, + "grad_norm": 0.23284584283828735, + "learning_rate": 2.0166153047839603e-06, + "loss": 0.0123, + "step": 29440 + }, + { + "epoch": 1.7646353885793036, + "grad_norm": 0.6289668083190918, + "learning_rate": 2.016028412496094e-06, + "loss": 0.0131, + "step": 29450 + }, + { + "epoch": 1.765234585655222, + "grad_norm": 0.26893526315689087, + "learning_rate": 2.015452063399292e-06, + "loss": 0.0144, + "step": 29460 + }, + { + "epoch": 1.7658337827311401, + "grad_norm": 0.31439170241355896, + "learning_rate": 2.014886258169932e-06, + "loss": 0.0105, + "step": 29470 + }, + { + "epoch": 1.7664329798070586, + "grad_norm": 0.3153708577156067, + "learning_rate": 2.014330997472017e-06, + "loss": 0.0118, + "step": 29480 + }, + { + "epoch": 1.7670321768829766, + "grad_norm": 0.25374165177345276, + "learning_rate": 2.013786281957177e-06, + "loss": 0.0143, + "step": 29490 + }, + { + "epoch": 1.7676313739588951, + "grad_norm": 0.43711739778518677, + "learning_rate": 2.0132521122646662e-06, + "loss": 0.0151, + "step": 29500 + }, + { + "epoch": 1.7682305710348134, + "grad_norm": 0.2920657992362976, + "learning_rate": 2.0127284890213623e-06, + "loss": 0.0142, + "step": 29510 + }, + { + "epoch": 1.7688297681107317, + "grad_norm": 0.45769479870796204, + "learning_rate": 2.012215412841767e-06, + "loss": 0.0118, + "step": 29520 + }, + { + "epoch": 1.76942896518665, + "grad_norm": 0.31419840455055237, + "learning_rate": 2.011712884328003e-06, + "loss": 0.0115, + "step": 29530 + }, + { + "epoch": 1.7700281622625682, + "grad_norm": 0.29443657398223877, + "learning_rate": 2.011220904069815e-06, + "loss": 0.0144, + "step": 29540 + }, + { + "epoch": 1.7706273593384865, + "grad_norm": 0.3117132782936096, + "learning_rate": 2.01073947264457e-06, + "loss": 0.0132, + "step": 29550 + }, + { + "epoch": 1.7712265564144047, + "grad_norm": 0.351385235786438, + "learning_rate": 2.0102685906172543e-06, + "loss": 0.0111, + "step": 29560 + }, + { + "epoch": 1.771825753490323, + "grad_norm": 0.27133694291114807, + "learning_rate": 2.009808258540475e-06, + "loss": 0.0154, + "step": 29570 + }, + { + "epoch": 1.7724249505662413, + "grad_norm": 0.30877798795700073, + "learning_rate": 2.009358476954456e-06, + "loss": 0.0093, + "step": 29580 + }, + { + "epoch": 1.7730241476421595, + "grad_norm": 0.2506785988807678, + "learning_rate": 2.008919246387043e-06, + "loss": 0.012, + "step": 29590 + }, + { + "epoch": 1.7736233447180778, + "grad_norm": 0.32467120885849, + "learning_rate": 2.0084905673536952e-06, + "loss": 0.0131, + "step": 29600 + }, + { + "epoch": 1.774222541793996, + "grad_norm": 0.22748734056949615, + "learning_rate": 2.0080724403574922e-06, + "loss": 0.0097, + "step": 29610 + }, + { + "epoch": 1.7748217388699143, + "grad_norm": 0.38346391916275024, + "learning_rate": 2.007664865889131e-06, + "loss": 0.014, + "step": 29620 + }, + { + "epoch": 1.7754209359458326, + "grad_norm": 0.296090304851532, + "learning_rate": 2.0072678444269208e-06, + "loss": 0.0145, + "step": 29630 + }, + { + "epoch": 1.7760201330217509, + "grad_norm": 0.2874438464641571, + "learning_rate": 2.006881376436789e-06, + "loss": 0.0133, + "step": 29640 + }, + { + "epoch": 1.7766193300976691, + "grad_norm": 0.2805752158164978, + "learning_rate": 2.0065054623722772e-06, + "loss": 0.0169, + "step": 29650 + }, + { + "epoch": 1.7772185271735874, + "grad_norm": 0.17779164016246796, + "learning_rate": 2.0061401026745425e-06, + "loss": 0.0118, + "step": 29660 + }, + { + "epoch": 1.7778177242495057, + "grad_norm": 0.316571444272995, + "learning_rate": 2.005785297772354e-06, + "loss": 0.0136, + "step": 29670 + }, + { + "epoch": 1.778416921325424, + "grad_norm": 0.8303540945053101, + "learning_rate": 2.005441048082095e-06, + "loss": 0.0172, + "step": 29680 + }, + { + "epoch": 1.7790161184013422, + "grad_norm": 0.3058635890483856, + "learning_rate": 2.0051073540077617e-06, + "loss": 0.0129, + "step": 29690 + }, + { + "epoch": 1.7796153154772605, + "grad_norm": 0.17514090240001678, + "learning_rate": 2.0047842159409633e-06, + "loss": 0.0099, + "step": 29700 + }, + { + "epoch": 1.7802145125531788, + "grad_norm": 0.22482258081436157, + "learning_rate": 2.004471634260919e-06, + "loss": 0.0131, + "step": 29710 + }, + { + "epoch": 1.780813709629097, + "grad_norm": 0.4026334881782532, + "learning_rate": 2.004169609334462e-06, + "loss": 0.0134, + "step": 29720 + }, + { + "epoch": 1.7814129067050153, + "grad_norm": 0.45236676931381226, + "learning_rate": 2.003878141516035e-06, + "loss": 0.0188, + "step": 29730 + }, + { + "epoch": 1.7820121037809336, + "grad_norm": 0.3150536119937897, + "learning_rate": 2.0035972311476916e-06, + "loss": 0.0126, + "step": 29740 + }, + { + "epoch": 1.7826113008568518, + "grad_norm": 0.9602782726287842, + "learning_rate": 2.0033268785590954e-06, + "loss": 0.0139, + "step": 29750 + }, + { + "epoch": 1.78321049793277, + "grad_norm": 0.2820151746273041, + "learning_rate": 2.003067084067522e-06, + "loss": 0.014, + "step": 29760 + }, + { + "epoch": 1.7838096950086884, + "grad_norm": 1.0188407897949219, + "learning_rate": 2.0028178479778523e-06, + "loss": 0.0137, + "step": 29770 + }, + { + "epoch": 1.7844088920846066, + "grad_norm": 0.26598837971687317, + "learning_rate": 2.0025791705825805e-06, + "loss": 0.0115, + "step": 29780 + }, + { + "epoch": 1.785008089160525, + "grad_norm": 0.2299095243215561, + "learning_rate": 2.0023510521618066e-06, + "loss": 0.0126, + "step": 29790 + }, + { + "epoch": 1.7856072862364432, + "grad_norm": 0.29679203033447266, + "learning_rate": 2.0021334929832407e-06, + "loss": 0.012, + "step": 29800 + }, + { + "epoch": 1.7862064833123614, + "grad_norm": 0.3352377116680145, + "learning_rate": 2.0019264933022016e-06, + "loss": 0.014, + "step": 29810 + }, + { + "epoch": 1.7868056803882797, + "grad_norm": 0.18228839337825775, + "learning_rate": 2.001730053361614e-06, + "loss": 0.0123, + "step": 29820 + }, + { + "epoch": 1.787404877464198, + "grad_norm": 0.5216359496116638, + "learning_rate": 2.0015441733920105e-06, + "loss": 0.0132, + "step": 29830 + }, + { + "epoch": 1.7880040745401162, + "grad_norm": 0.3130887746810913, + "learning_rate": 2.0013688536115332e-06, + "loss": 0.0168, + "step": 29840 + }, + { + "epoch": 1.7886032716160345, + "grad_norm": 0.4271252751350403, + "learning_rate": 2.0012040942259285e-06, + "loss": 0.0141, + "step": 29850 + }, + { + "epoch": 1.7892024686919528, + "grad_norm": 0.32060664892196655, + "learning_rate": 2.0010498954285506e-06, + "loss": 0.0134, + "step": 29860 + }, + { + "epoch": 1.789801665767871, + "grad_norm": 0.4360806345939636, + "learning_rate": 2.00090625740036e-06, + "loss": 0.0125, + "step": 29870 + }, + { + "epoch": 1.7904008628437893, + "grad_norm": 0.35824981331825256, + "learning_rate": 2.0007731803099256e-06, + "loss": 0.0129, + "step": 29880 + }, + { + "epoch": 1.7910000599197076, + "grad_norm": 0.37794366478919983, + "learning_rate": 2.00065066431342e-06, + "loss": 0.0151, + "step": 29890 + }, + { + "epoch": 1.7915992569956258, + "grad_norm": 0.302745521068573, + "learning_rate": 2.0005387095546222e-06, + "loss": 0.0142, + "step": 29900 + }, + { + "epoch": 1.792198454071544, + "grad_norm": 0.19773688912391663, + "learning_rate": 2.000437316164917e-06, + "loss": 0.0103, + "step": 29910 + }, + { + "epoch": 1.7927976511474624, + "grad_norm": 0.2933025658130646, + "learning_rate": 2.000346484263297e-06, + "loss": 0.0135, + "step": 29920 + }, + { + "epoch": 1.7933968482233806, + "grad_norm": 0.2572041451931, + "learning_rate": 2.0002662139563564e-06, + "loss": 0.0165, + "step": 29930 + }, + { + "epoch": 1.793996045299299, + "grad_norm": 0.6411796808242798, + "learning_rate": 2.0001965053382976e-06, + "loss": 0.0127, + "step": 29940 + }, + { + "epoch": 1.7945952423752172, + "grad_norm": 0.3087517321109772, + "learning_rate": 2.000137358490928e-06, + "loss": 0.0133, + "step": 29950 + }, + { + "epoch": 1.7951944394511354, + "grad_norm": 0.3539549708366394, + "learning_rate": 2.0000887734836583e-06, + "loss": 0.0117, + "step": 29960 + }, + { + "epoch": 1.7957936365270537, + "grad_norm": 0.3078557848930359, + "learning_rate": 2.0000507503735076e-06, + "loss": 0.0118, + "step": 29970 + }, + { + "epoch": 1.796392833602972, + "grad_norm": 0.44483524560928345, + "learning_rate": 2.0000232892050976e-06, + "loss": 0.0122, + "step": 29980 + }, + { + "epoch": 1.7969920306788902, + "grad_norm": 0.3110407888889313, + "learning_rate": 2.000006390010655e-06, + "loss": 0.0126, + "step": 29990 + }, + { + "epoch": 1.7975912277548085, + "grad_norm": 0.2597223222255707, + "learning_rate": 2.0000000528100118e-06, + "loss": 0.0138, + "step": 30000 + }, + { + "epoch": 1.7975912277548085, + "step": 30000, + "total_flos": 1.873893288742748e+17, + "train_loss": 0.03163320524344842, + "train_runtime": 19519.4366, + "train_samples_per_second": 12.295, + "train_steps_per_second": 1.537 + } + ], + "logging_steps": 10, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.873893288742748e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/training_args.bin b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..5fecc60b61aa66699566b01045633ce2fd4a6a74 --- /dev/null +++ b/libero_on_top_extra_pi0_VIS_PROJ_HEAD/libero_on_top_extra_pi0_20260201-161941_lr2e-05_batchsize8/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ad96fcc5212b0fb64af2ed9b5a1ad33dee0cea6a86c08271b39c38f4388a38a +size 6097